feat: Improve email content extraction by enhancing header stripping and cleaning logic

This commit is contained in:
2025-05-29 20:16:12 +02:00
parent 1756f3462b
commit e248e26b2d
2 changed files with 52 additions and 40 deletions

View File

@@ -295,7 +295,8 @@ func extractMessageContent(body string) string {
"error": err,
"bodyLength": len(body),
}).Debug("Failed to parse email message, falling back to simple extraction")
return fallbackExtractContent(body)
// When ReadMessage fails, the body is raw, so header stripping is needed.
return cleanMessageContent(fallbackExtractContent(body), true)
}
contentTypeHeader := msg.Header.Get("Content-Type")
@@ -308,7 +309,8 @@ func extractMessageContent(body string) string {
"error": err,
"contentTypeHeader": contentTypeHeader,
}).Debug("Failed to parse Content-Type header, falling back to simple extraction")
return fallbackExtractContent(body)
// When ParseMediaType fails, the body is raw, so header stripping is needed.
return cleanMessageContent(fallbackExtractContent(body), true)
}
logger.WithFields(logrus.Fields{
@@ -345,12 +347,15 @@ func extractMessageContent(body string) string {
if content == "" {
logger.Debug("No content extracted, falling back to simple extraction")
logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
return fallbackExtractContent(body)
// When primary extraction yields no content, the body is raw, so header stripping is needed.
return cleanMessageContent(fallbackExtractContent(body), true)
}
// Clean up the content
logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
content = cleanMessageContent(content)
// When ReadMessage succeeds, 'content' is already the message body (or part body),
// so no further header stripping should be done.
content = cleanMessageContent(content, false)
logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")
logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
@@ -426,38 +431,46 @@ func handleSinglePartMessage(reader io.Reader) string {
return content
}
func cleanMessageContent(content string) string {
func cleanMessageContent(content string, performHeaderStripping bool) string {
logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
// Remove any remaining email headers that might be in the body
lines := strings.Split(content, "\n")
var cleanLines []string
headerSection := true
logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag")
for _, line := range lines {
trimmed := strings.TrimSpace(line)
if performHeaderStripping {
// Remove any remaining email headers that might be in the body
lines := strings.Split(content, "\n")
var cleanLines []string
headerSection := true
// Empty line marks the end of headers
if headerSection && trimmed == "" {
headerSection = false
continue
}
for _, line := range lines {
trimmed := strings.TrimSpace(line)
// Skip header lines
if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
continue
}
// Empty line marks the end of headers
if headerSection && trimmed == "" {
headerSection = false
continue
}
// Add non-header lines
if !headerSection {
// Skip header lines
if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
continue
}
// Add non-header lines
// This condition was originally !headerSection. The guard was removed so that
// non-empty lines without a ":" that appear before the first blank line are kept
// instead of being dropped. Note that headerSection is only cleared by a blank line,
// so lines containing ":" are still skipped until that blank line is reached.
cleanLines = append(cleanLines, line)
}
content = strings.Join(cleanLines, "\n")
logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
} else {
logger.Debug("cleanMessageContent: Skipping header stripping")
}
content = strings.Join(cleanLines, "\n")
logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
// Convert newlines to HTML breaks for display
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
// First, normalize all newlines (\r\n or \n) to just \n
content = strings.ReplaceAll(content, "\r\n", "\n")
// Then, replace each \n with <br>\n
content = strings.ReplaceAll(content, "\n", "<br>\n")
logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")
@@ -474,26 +487,25 @@ func cleanMessageContent(content string) string {
func fallbackExtractContent(body string) string {
logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
var content string
parts := strings.Split(body, "\r\n\r\n")
if len(parts) > 1 {
content := strings.Join(parts[1:], "\r\n\r\n")
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
content = strings.ReplaceAll(content, "\n", "<br>\n")
content = strings.Join(parts[1:], "\r\n\r\n")
logger.WithFields(logrus.Fields{
"contentLength": len(content),
"partsCount": len(parts),
}).Debug("Successfully extracted content using fallback method")
}).Debug("Successfully extracted content using fallback method (from parts)")
logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
return content
} else {
content = body
logger.WithFields(logrus.Fields{
"contentLength": len(content),
"fullBody": true,
}).Debug("Using full body as content in fallback method")
logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)")
}
content := body
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
content = strings.ReplaceAll(content, "\n", "<br>\n")
logger.WithFields(logrus.Fields{
"contentLength": len(content),
"fullBody": true,
}).Debug("Using full body as content in fallback method")
logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body")
// Newline conversion and signature stripping will be handled by cleanMessageContent,
// which is now called by the callers of fallbackExtractContent.
return content
}