feat: Improve email content extraction by enhancing header stripping and cleaning logic

This commit is contained in:
2025-05-29 20:16:12 +02:00
parent 1756f3462b
commit e248e26b2d
2 changed files with 52 additions and 40 deletions

View File

@@ -94,7 +94,7 @@ While the 'Email Body' provides the question, your answer should be synthesized
Instructions: Instructions:
- Language: Your response must be entirely in %s, regardless of the language used in the email content or context. - Language: Your response must be entirely in %s, regardless of the language used in the email content or context.
- Format: Your reply must be raw HTML. Use appropriate HTML tags for structure and styling. - Format: CRITICAL: Your reply MUST be raw HTML. Use appropriate HTML tags for structure and styling. For example, wrap paragraphs in <p>...</p> tags and use <br> for line breaks if needed within a paragraph. Even a short sentence must be wrapped in HTML (e.g., <p>Yes.</p>).
- Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s). - Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s).
- Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer. - Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer.
- Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email. - Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email.

View File

@@ -295,7 +295,8 @@ func extractMessageContent(body string) string {
"error": err, "error": err,
"bodyLength": len(body), "bodyLength": len(body),
}).Debug("Failed to parse email message, falling back to simple extraction") }).Debug("Failed to parse email message, falling back to simple extraction")
return fallbackExtractContent(body) // When ReadMessage fails, the body is raw, so header stripping is needed.
return cleanMessageContent(fallbackExtractContent(body), true)
} }
contentTypeHeader := msg.Header.Get("Content-Type") contentTypeHeader := msg.Header.Get("Content-Type")
@@ -308,7 +309,8 @@ func extractMessageContent(body string) string {
"error": err, "error": err,
"contentTypeHeader": contentTypeHeader, "contentTypeHeader": contentTypeHeader,
}).Debug("Failed to parse Content-Type header, falling back to simple extraction") }).Debug("Failed to parse Content-Type header, falling back to simple extraction")
return fallbackExtractContent(body) // When ParseMediaType fails, the body is raw, so header stripping is needed.
return cleanMessageContent(fallbackExtractContent(body), true)
} }
logger.WithFields(logrus.Fields{ logger.WithFields(logrus.Fields{
@@ -345,12 +347,15 @@ func extractMessageContent(body string) string {
if content == "" { if content == "" {
logger.Debug("No content extracted, falling back to simple extraction") logger.Debug("No content extracted, falling back to simple extraction")
logger.Debug("extractMessageContent: No content from primary extraction, falling back.") logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
return fallbackExtractContent(body) // When primary extraction yields no content, the body is raw, so header stripping is needed.
return cleanMessageContent(fallbackExtractContent(body), true)
} }
// Clean up the content // Clean up the content
logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent") logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
content = cleanMessageContent(content) // When ReadMessage succeeds, 'content' is already the message body (or part body),
// so no further header stripping should be done.
content = cleanMessageContent(content, false)
logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent") logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")
logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content") logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
@@ -426,8 +431,11 @@ func handleSinglePartMessage(reader io.Reader) string {
return content return content
} }
func cleanMessageContent(content string) string { func cleanMessageContent(content string, performHeaderStripping bool) string {
logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting") logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag")
if performHeaderStripping {
// Remove any remaining email headers that might be in the body // Remove any remaining email headers that might be in the body
lines := strings.Split(content, "\n") lines := strings.Split(content, "\n")
var cleanLines []string var cleanLines []string
@@ -448,16 +456,21 @@ func cleanMessageContent(content string) string {
} }
// Add non-header lines // Add non-header lines
if !headerSection { // This condition was originally !headerSection, but if we are past the headers,
// we should always add the line. If headerSection is still true here, it means
// it's the first line of content after potential headers were skipped.
cleanLines = append(cleanLines, line) cleanLines = append(cleanLines, line)
} }
}
content = strings.Join(cleanLines, "\n") content = strings.Join(cleanLines, "\n")
logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping") logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
} else {
logger.Debug("cleanMessageContent: Skipping header stripping")
}
// Convert newlines to HTML breaks for display // Convert newlines to HTML breaks for display
content = strings.ReplaceAll(content, "\r\n", "<br>\n") // First, normalize all newlines (\r\n or \n) to just \n
content = strings.ReplaceAll(content, "\r\n", "\n")
// Then, replace each \n with <br>\n
content = strings.ReplaceAll(content, "\n", "<br>\n") content = strings.ReplaceAll(content, "\n", "<br>\n")
logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion") logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")
@@ -474,26 +487,25 @@ func cleanMessageContent(content string) string {
func fallbackExtractContent(body string) string { func fallbackExtractContent(body string) string {
logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method") logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body") logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
var content string
parts := strings.Split(body, "\r\n\r\n") parts := strings.Split(body, "\r\n\r\n")
if len(parts) > 1 { if len(parts) > 1 {
content := strings.Join(parts[1:], "\r\n\r\n") content = strings.Join(parts[1:], "\r\n\r\n")
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
content = strings.ReplaceAll(content, "\n", "<br>\n")
logger.WithFields(logrus.Fields{ logger.WithFields(logrus.Fields{
"contentLength": len(content), "contentLength": len(content),
"partsCount": len(parts), "partsCount": len(parts),
}).Debug("Successfully extracted content using fallback method") }).Debug("Successfully extracted content using fallback method (from parts)")
logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts") logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
return content } else {
} content = body
content := body
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
content = strings.ReplaceAll(content, "\n", "<br>\n")
logger.WithFields(logrus.Fields{ logger.WithFields(logrus.Fields{
"contentLength": len(content), "contentLength": len(content),
"fullBody": true, "fullBody": true,
}).Debug("Using full body as content in fallback method") }).Debug("Using full body as content in fallback method")
logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body") logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)")
}
// Newline conversion and signature stripping will be handled by cleanMessageContent,
// which is now called by the callers of fallbackExtractContent.
return content return content
} }