From e248e26b2d3e5f1c2f418d1b99e39927d107ae30 Mon Sep 17 00:00:00 2001
From: Dominik Polakovics <dominik.polakovics@cloonar.com>
Date: Thu, 29 May 2025 20:16:12 +0200
Subject: [PATCH] feat: Improve email content extraction by enhancing header
 stripping and cleaning logic

---
 internal/ai/ai.go     |  2 +-
 internal/imap/imap.go | 90 ++++++++++++++++++++++++-------------------
 2 files changed, 52 insertions(+), 40 deletions(-)
diff --git a/internal/ai/ai.go b/internal/ai/ai.go
index 550927b..65d8396 100644
--- a/internal/ai/ai.go
+++ b/internal/ai/ai.go
@@ -94,7 +94,7 @@ While the 'Email Body' provides the question, your answer should be synthesized
 
 Instructions:
 - Language: Your response must be entirely in %s, regardless of the language used in the email content or context.
-- Format: Your reply must be raw HTML. Use appropriate HTML tags for structure and styling.
+- Format: CRITICAL: Your reply MUST be raw HTML. Use appropriate HTML tags for structure and styling. For example, wrap paragraphs in <p>...</p> tags and use <br> for line breaks if needed within a paragraph. Even a short sentence must be wrapped in HTML (e.g., <p>Yes.</p>).
 - Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s).
 - Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer.
 - Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email.
diff --git a/internal/imap/imap.go b/internal/imap/imap.go
index 929e270..91bcb22 100644
--- a/internal/imap/imap.go
+++ b/internal/imap/imap.go
@@ -295,7 +295,8 @@ func extractMessageContent(body string) string {
 			"error":      err,
 			"bodyLength": len(body),
 		}).Debug("Failed to parse email message, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ReadMessage fails, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}
 
 	contentTypeHeader := msg.Header.Get("Content-Type")
@@ -308,7 +309,8 @@ func extractMessageContent(body string) string {
 			"error":             err,
 			"contentTypeHeader": contentTypeHeader,
 		}).Debug("Failed to parse Content-Type header, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ParseMediaType fails, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}
 
 	logger.WithFields(logrus.Fields{
@@ -345,12 +347,15 @@ func extractMessageContent(body string) string {
 	if content == "" {
 		logger.Debug("No content extracted, falling back to simple extraction")
 		logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
-		return fallbackExtractContent(body)
+		// When primary extraction yields no content, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}
 
 	// Clean up the content
 	logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
-	content = cleanMessageContent(content)
+	// When ReadMessage succeeds, 'content' is already the message body (or part body),
+	// so no further header stripping should be done.
+	content = cleanMessageContent(content, false)
 	logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")
 
 	logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
@@ -426,38 +431,46 @@ func handleSinglePartMessage(reader io.Reader) string {
 	return content
 }
 
-func cleanMessageContent(content string) string {
+func cleanMessageContent(content string, performHeaderStripping bool) string {
 	logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
-	// Remove any remaining email headers that might be in the body
-	lines := strings.Split(content, "\n")
-	var cleanLines []string
-	headerSection := true
+	logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag")
 
-	for _, line := range lines {
-		trimmed := strings.TrimSpace(line)
+	if performHeaderStripping {
+		// Remove any remaining email headers that might be in the body
+		lines := strings.Split(content, "\n")
+		var cleanLines []string
+		headerSection := true
 
-		// Empty line marks the end of headers
-		if headerSection && trimmed == "" {
-			headerSection = false
-			continue
-		}
+		for _, line := range lines {
+			trimmed := strings.TrimSpace(line)
 
-		// Skip header lines
-		if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
-			continue
-		}
+			// Empty line marks the end of headers
+			if headerSection && trimmed == "" {
+				headerSection = false
+				continue
+			}
 
-		// Add non-header lines
-		if !headerSection {
+			// Skip header lines
+			if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
+				continue
+			}
+
+			// Add non-header lines
+			// Keep the line. headerSection only flips to false on the first blank line,
+			// so a colon-free line seen before that blank line is kept as content, while
+			// any later line containing ":" is still skipped until the blank line is reached.
 			cleanLines = append(cleanLines, line)
 		}
+		content = strings.Join(cleanLines, "\n")
+		logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
+	} else {
+		logger.Debug("cleanMessageContent: Skipping header stripping")
 	}
 
-	content = strings.Join(cleanLines, "\n")
-	logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
-
 	// Convert newlines to HTML breaks for display
-	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
+	// First, normalize all newlines (\r\n or \n) to just \n
+	content = strings.ReplaceAll(content, "\r\n", "\n")
+	// Then, replace each \n with <br>\n
 	content = strings.ReplaceAll(content, "\n", "<br>\n")
 	logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")
 
@@ -474,26 +487,25 @@ func cleanMessageContent(content string) string {
 func fallbackExtractContent(body string) string {
 	logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
 	logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
+	var content string
 	parts := strings.Split(body, "\r\n\r\n")
 	if len(parts) > 1 {
-		content := strings.Join(parts[1:], "\r\n\r\n")
-		content = strings.ReplaceAll(content, "\r\n", "<br>\n")
-		content = strings.ReplaceAll(content, "\n", "<br>\n")
+		content = strings.Join(parts[1:], "\r\n\r\n")
 		logger.WithFields(logrus.Fields{
 			"contentLength": len(content),
 			"partsCount":    len(parts),
-		}).Debug("Successfully extracted content using fallback method")
+		}).Debug("Successfully extracted content using fallback method (from parts)")
 		logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
-		return content
+	} else {
+		content = body
+		logger.WithFields(logrus.Fields{
+			"contentLength": len(content),
+			"fullBody":      true,
+		}).Debug("Using full body as content in fallback method")
+		logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)")
 	}
-	content := body
-	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
-	content = strings.ReplaceAll(content, "\n", "<br>\n")
-	logger.WithFields(logrus.Fields{
-		"contentLength": len(content),
-		"fullBody":      true,
-	}).Debug("Using full body as content in fallback method")
-	logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body")
+	// Newline conversion and signature stripping will be handled by cleanMessageContent,
+	// which is now called by the callers of fallbackExtractContent.
 	return content
 }