From e248e26b2d3e5f1c2f418d1b99e39927d107ae30 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Thu, 29 May 2025 20:16:12 +0200 Subject: [PATCH] feat: Improve email content extraction by enhancing header stripping and cleaning logic --- internal/ai/ai.go | 2 +- internal/imap/imap.go | 90 ++++++++++++++++++++++++------------------- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/internal/ai/ai.go b/internal/ai/ai.go index 550927b..65d8396 100644 --- a/internal/ai/ai.go +++ b/internal/ai/ai.go @@ -94,7 +94,7 @@ While the 'Email Body' provides the question, your answer should be synthesized Instructions: - Language: Your response must be entirely in %s, regardless of the language used in the email content or context. -- Format: Your reply must be raw HTML. Use appropriate HTML tags for structure and styling. +- Format: CRITICAL: Your reply MUST be raw HTML. Use appropriate HTML tags for structure and styling. For example, wrap paragraphs in

<p>...</p> tags and use <br> for line breaks if needed within a paragraph. Even a short sentence must be wrapped in HTML (e.g., <p>Yes.</p>
). - Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s). - Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer. - Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email. diff --git a/internal/imap/imap.go b/internal/imap/imap.go index 929e270..91bcb22 100644 --- a/internal/imap/imap.go +++ b/internal/imap/imap.go @@ -295,7 +295,8 @@ func extractMessageContent(body string) string { "error": err, "bodyLength": len(body), }).Debug("Failed to parse email message, falling back to simple extraction") - return fallbackExtractContent(body) + // When ReadMessage fails, the body is raw, so header stripping is needed. + return cleanMessageContent(fallbackExtractContent(body), true) } contentTypeHeader := msg.Header.Get("Content-Type") @@ -308,7 +309,8 @@ func extractMessageContent(body string) string { "error": err, "contentTypeHeader": contentTypeHeader, }).Debug("Failed to parse Content-Type header, falling back to simple extraction") - return fallbackExtractContent(body) + // When ParseMediaType fails, the body is raw, so header stripping is needed. + return cleanMessageContent(fallbackExtractContent(body), true) } logger.WithFields(logrus.Fields{ @@ -345,12 +347,15 @@ func extractMessageContent(body string) string { if content == "" { logger.Debug("No content extracted, falling back to simple extraction") logger.Debug("extractMessageContent: No content from primary extraction, falling back.") - return fallbackExtractContent(body) + // When primary extraction yields no content, the body is raw, so header stripping is needed. 
+ return cleanMessageContent(fallbackExtractContent(body), true) } // Clean up the content logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent") - content = cleanMessageContent(content) + // When ReadMessage succeeds, 'content' is already the message body (or part body), + // so no further header stripping should be done. + content = cleanMessageContent(content, false) logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent") logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content") @@ -426,38 +431,46 @@ func handleSinglePartMessage(reader io.Reader) string { return content } -func cleanMessageContent(content string) string { +func cleanMessageContent(content string, performHeaderStripping bool) string { logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting") - // Remove any remaining email headers that might be in the body - lines := strings.Split(content, "\n") - var cleanLines []string - headerSection := true + logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag") - for _, line := range lines { - trimmed := strings.TrimSpace(line) + if performHeaderStripping { + // Remove any remaining email headers that might be in the body + lines := strings.Split(content, "\n") + var cleanLines []string + headerSection := true - // Empty line marks the end of headers - if headerSection && trimmed == "" { - headerSection = false - continue - } + for _, line := range lines { + trimmed := strings.TrimSpace(line) - // Skip header lines - if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") { - continue - } + // Empty line marks the end of headers + if headerSection && trimmed == "" { + headerSection = false + continue + } - // Add non-header lines - if !headerSection { + // Skip header lines + 
if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") { + continue + } + + // Add non-header lines + // This condition was originally !headerSection, but if we are past the headers, + // we should always add the line. If headerSection is still true here, it means + // it's the first line of content after potential headers were skipped. cleanLines = append(cleanLines, line) } + content = strings.Join(cleanLines, "\n") + logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping") + } else { + logger.Debug("cleanMessageContent: Skipping header stripping") } - content = strings.Join(cleanLines, "\n") - logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping") - // Convert newlines to HTML breaks for display - content = strings.ReplaceAll(content, "\r\n", "
\n") + // First, normalize all newlines (\r\n or \n) to just \n + content = strings.ReplaceAll(content, "\r\n", "\n") + // Then, replace each \n with
\n content = strings.ReplaceAll(content, "\n", "
\n") logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion") @@ -474,26 +487,25 @@ func cleanMessageContent(content string) string { func fallbackExtractContent(body string) string { logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method") logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body") + var content string parts := strings.Split(body, "\r\n\r\n") if len(parts) > 1 { - content := strings.Join(parts[1:], "\r\n\r\n") - content = strings.ReplaceAll(content, "\r\n", "
\n") - content = strings.ReplaceAll(content, "\n", "
\n") + content = strings.Join(parts[1:], "\r\n\r\n") logger.WithFields(logrus.Fields{ "contentLength": len(content), "partsCount": len(parts), - }).Debug("Successfully extracted content using fallback method") + }).Debug("Successfully extracted content using fallback method (from parts)") logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts") - return content + } else { + content = body + logger.WithFields(logrus.Fields{ + "contentLength": len(content), + "fullBody": true, + }).Debug("Using full body as content in fallback method") + logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)") } - content := body - content = strings.ReplaceAll(content, "\r\n", "
\n") - content = strings.ReplaceAll(content, "\n", "
\n") - logger.WithFields(logrus.Fields{ - "contentLength": len(content), - "fullBody": true, - }).Debug("Using full body as content in fallback method") - logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body") + // Newline conversion and signature stripping will be handled by cleanMessageContent, + // which is now called by the callers of fallbackExtractContent. return content }