feat: Improve email content extraction by enhancing header stripping and cleaning logic

2025-05-29 20:16:12 +02:00
parent 1756f3462b
commit e248e26b2d
2 changed files with 52 additions and 40 deletions
--- a/internal/ai/ai.go
+++ b/internal/ai/ai.go
@@ -94,7 +94,7 @@ While the 'Email Body' provides the question, your answer should be synthesized
 Instructions:
 - Language: Your response must be entirely in %s, regardless of the language used in the email content or context.
- Format: Your reply must be raw HTML. Use appropriate HTML tags for structure and styling.
+- Format: CRITICAL: Your reply MUST be raw HTML. Use appropriate HTML tags for structure and styling. For example, wrap paragraphs in <p>...</p> tags and use <br> for line breaks if needed within a paragraph. Even a short sentence must be wrapped in HTML (e.g., <p>Yes.</p>).
 - Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s).
 - Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer.
 - Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email.
--- a/internal/imap/imap.go
+++ b/internal/imap/imap.go
@@ -295,7 +295,8 @@ func extractMessageContent(body string) string {
 			"error":      err,
 			"bodyLength": len(body),
 		}).Debug("Failed to parse email message, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ReadMessage fails, the body is raw, so header stripping is needed.
 		return cleanMessageContent(fallbackExtractContent(body), true)
 	}
 	contentTypeHeader := msg.Header.Get("Content-Type")
@@ -308,7 +309,8 @@ func extractMessageContent(body string) string {
 			"error":             err,
 			"contentTypeHeader": contentTypeHeader,
 		}).Debug("Failed to parse Content-Type header, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ParseMediaType fails, the body is raw, so header stripping is needed.
 		return cleanMessageContent(fallbackExtractContent(body), true)
 	}
 	logger.WithFields(logrus.Fields{
@@ -345,12 +347,15 @@ func extractMessageContent(body string) string {
 	if content == "" {
 		logger.Debug("No content extracted, falling back to simple extraction")
 		logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
-		return fallbackExtractContent(body)
+		// When primary extraction yields no content, the body is raw, so header stripping is needed.
 		return cleanMessageContent(fallbackExtractContent(body), true)
 	}
 	// Clean up the content
 	logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
-	content = cleanMessageContent(content)
+	// When ReadMessage succeeds, 'content' is already the message body (or part body),
 	// so no further header stripping should be done.
 	content = cleanMessageContent(content, false)
 	logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")
 	logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
@@ -426,8 +431,11 @@ func handleSinglePartMessage(reader io.Reader) string {
 	return content
 }
-func cleanMessageContent(content string) string {
+func cleanMessageContent(content string, performHeaderStripping bool) string {
 	logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
 	logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag")
 	if performHeaderStripping {
 		// Remove any remaining email headers that might be in the body
 		lines := strings.Split(content, "\n")
 		var cleanLines []string
@@ -448,16 +456,21 @@ func cleanMessageContent(content string) string {
 			}
 			// Add non-header lines
-		if !headerSection {
+			// The append was originally guarded by !headerSection, but once we are past
 			// the headers every line should be added. If headerSection is still true here,
 			// this line is the first line of content after any potential headers were skipped.
 			cleanLines = append(cleanLines, line)
 		}
 	}
 		content = strings.Join(cleanLines, "\n")
 		logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
 	} else {
 		logger.Debug("cleanMessageContent: Skipping header stripping")
 	}
 	// Convert newlines to HTML breaks for display
-	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
+	// First, normalize all newlines (\r\n or \n) to just \n
 	content = strings.ReplaceAll(content, "\r\n", "\n")
 	// Then, replace each \n with <br>\n
 	content = strings.ReplaceAll(content, "\n", "<br>\n")
 	logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")
@@ -474,26 +487,25 @@ func cleanMessageContent(content string) string {
 func fallbackExtractContent(body string) string {
 	logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
 	logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
 	var content string
 	parts := strings.Split(body, "\r\n\r\n")
 	if len(parts) > 1 {
-		content := strings.Join(parts[1:], "\r\n\r\n")
+		content = strings.Join(parts[1:], "\r\n\r\n")
 		content = strings.ReplaceAll(content, "\r\n", "<br>\n")
 		content = strings.ReplaceAll(content, "\n", "<br>\n")
 		logger.WithFields(logrus.Fields{
 			"contentLength": len(content),
 			"partsCount":    len(parts),
-		}).Debug("Successfully extracted content using fallback method")
+		}).Debug("Successfully extracted content using fallback method (from parts)")
 		logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
-		return content
+	} else {
-	}
+		content = body
 	content := body
 	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
 	content = strings.ReplaceAll(content, "\n", "<br>\n")
 		logger.WithFields(logrus.Fields{
 			"contentLength": len(content),
 			"fullBody":      true,
 		}).Debug("Using full body as content in fallback method")
-	logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body")
+		logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)")
 	}
 	// Newline conversion and signature stripping will be handled by cleanMessageContent,
 	// which is now called by the callers of fallbackExtractContent.
 	return content
 }