feat: Improve email content extraction by enhancing header stripping and cleaning logic

2025-05-29 20:16:12 +02:00
parent 1756f3462b
commit e248e26b2d
2 changed files with 52 additions and 40 deletions
--- a/internal/imap/imap.go
+++ b/internal/imap/imap.go
@@ -295,7 +295,8 @@ func extractMessageContent(body string) string {
 			"error":      err,
 			"bodyLength": len(body),
 		}).Debug("Failed to parse email message, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ReadMessage fails, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}

 	contentTypeHeader := msg.Header.Get("Content-Type")
@@ -308,7 +309,8 @@ func extractMessageContent(body string) string {
 			"error":             err,
 			"contentTypeHeader": contentTypeHeader,
 		}).Debug("Failed to parse Content-Type header, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ParseMediaType fails, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}

 	logger.WithFields(logrus.Fields{
@@ -345,12 +347,15 @@ func extractMessageContent(body string) string {
 	if content == "" {
 		logger.Debug("No content extracted, falling back to simple extraction")
 		logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
-		return fallbackExtractContent(body)
+		// When primary extraction yields no content, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}

 	// Clean up the content
 	logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
-	content = cleanMessageContent(content)
+	// When ReadMessage succeeds, 'content' is already the message body (or part body),
+	// so no further header stripping should be done.
+	content = cleanMessageContent(content, false)
 	logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")

 	logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
@@ -426,38 +431,46 @@ func handleSinglePartMessage(reader io.Reader) string {
 	return content
 }

-func cleanMessageContent(content string) string {
+func cleanMessageContent(content string, performHeaderStripping bool) string {
 	logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
-	// Remove any remaining email headers that might be in the body
-	lines := strings.Split(content, "\n")
-	var cleanLines []string
-	headerSection := true
+	logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag")

-	for _, line := range lines {
-		trimmed := strings.TrimSpace(line)
+	if performHeaderStripping {
+		// Remove any remaining email headers that might be in the body
+		lines := strings.Split(content, "\n")
+		var cleanLines []string
+		headerSection := true

-		// Empty line marks the end of headers
-		if headerSection && trimmed == "" {
-			headerSection = false
-			continue
-		}
+		for _, line := range lines {
+			trimmed := strings.TrimSpace(line)

-		// Skip header lines
-		if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
-			continue
-		}
+			// Empty line marks the end of headers
+			if headerSection && trimmed == "" {
+				headerSection = false
+				continue
+			}

-		// Add non-header lines
-		if !headerSection {
+			// Skip header lines
+			if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
+				continue
+			}
+
+			// Add non-header lines
+			// Keep the line. The original guard was !headerSection; it was dropped so that a
+			// non-empty, colon-free line seen before the first blank line is kept as content.
+			// Note that headerSection stays true in that case until a blank line is reached.
 			cleanLines = append(cleanLines, line)
 		}
+		content = strings.Join(cleanLines, "\n")
+		logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
+	} else {
+		logger.Debug("cleanMessageContent: Skipping header stripping")
 	}

-	content = strings.Join(cleanLines, "\n")
-	logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
-
 	// Convert newlines to HTML breaks for display
-	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
+	// First, normalize all newlines (\r\n or \n) to just \n
+	content = strings.ReplaceAll(content, "\r\n", "\n")
+	// Then, replace each \n with <br>\n
 	content = strings.ReplaceAll(content, "\n", "<br>\n")
 	logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")

@@ -474,26 +487,25 @@ func cleanMessageContent(content string) string {
 func fallbackExtractContent(body string) string {
 	logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
 	logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
+	var content string
 	parts := strings.Split(body, "\r\n\r\n")
 	if len(parts) > 1 {
-		content := strings.Join(parts[1:], "\r\n\r\n")
-		content = strings.ReplaceAll(content, "\r\n", "<br>\n")
-		content = strings.ReplaceAll(content, "\n", "<br>\n")
+		content = strings.Join(parts[1:], "\r\n\r\n")
 		logger.WithFields(logrus.Fields{
 			"contentLength": len(content),
 			"partsCount":    len(parts),
-		}).Debug("Successfully extracted content using fallback method")
+		}).Debug("Successfully extracted content using fallback method (from parts)")
 		logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
-		return content
+	} else {
+		content = body
+		logger.WithFields(logrus.Fields{
+			"contentLength": len(content),
+			"fullBody":      true,
+		}).Debug("Using full body as content in fallback method")
+		logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)")
 	}
-	content := body
-	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
-	content = strings.ReplaceAll(content, "\n", "<br>\n")
-	logger.WithFields(logrus.Fields{
-		"contentLength": len(content),
-		"fullBody":      true,
-	}).Debug("Using full body as content in fallback method")
-	logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body")
+	// Newline conversion and signature stripping will be handled by cleanMessageContent,
+	// which is now called by the callers of fallbackExtractContent.
 	return content
 }