feat: Improve email content extraction by enhancing header stripping and cleaning logic
This commit is contained in:
@@ -94,7 +94,7 @@ While the 'Email Body' provides the question, your answer should be synthesized
|
|||||||
|
|
||||||
Instructions:
|
Instructions:
|
||||||
- Language: Your response must be entirely in %s, regardless of the language used in the email content or context.
|
- Language: Your response must be entirely in %s, regardless of the language used in the email content or context.
|
||||||
- Format: Your reply must be raw HTML. Use appropriate HTML tags for structure and styling.
|
- Format: CRITICAL: Your reply MUST be raw HTML. Use appropriate HTML tags for structure and styling. For example, wrap paragraphs in <p>...</p> tags and use <br> for line breaks if needed within a paragraph. Even a short sentence must be wrapped in HTML (e.g., <p>Yes.</p>).
|
||||||
- Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s).
|
- Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s).
|
||||||
- Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer.
|
- Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer.
|
||||||
- Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email.
|
- Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email.
|
||||||
|
|||||||
@@ -295,7 +295,8 @@ func extractMessageContent(body string) string {
|
|||||||
"error": err,
|
"error": err,
|
||||||
"bodyLength": len(body),
|
"bodyLength": len(body),
|
||||||
}).Debug("Failed to parse email message, falling back to simple extraction")
|
}).Debug("Failed to parse email message, falling back to simple extraction")
|
||||||
return fallbackExtractContent(body)
|
// When ReadMessage fails, the body is raw, so header stripping is needed.
|
||||||
|
return cleanMessageContent(fallbackExtractContent(body), true)
|
||||||
}
|
}
|
||||||
|
|
||||||
contentTypeHeader := msg.Header.Get("Content-Type")
|
contentTypeHeader := msg.Header.Get("Content-Type")
|
||||||
@@ -308,7 +309,8 @@ func extractMessageContent(body string) string {
|
|||||||
"error": err,
|
"error": err,
|
||||||
"contentTypeHeader": contentTypeHeader,
|
"contentTypeHeader": contentTypeHeader,
|
||||||
}).Debug("Failed to parse Content-Type header, falling back to simple extraction")
|
}).Debug("Failed to parse Content-Type header, falling back to simple extraction")
|
||||||
return fallbackExtractContent(body)
|
// When ParseMediaType fails, the body is raw, so header stripping is needed.
|
||||||
|
return cleanMessageContent(fallbackExtractContent(body), true)
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.WithFields(logrus.Fields{
|
logger.WithFields(logrus.Fields{
|
||||||
@@ -345,12 +347,15 @@ func extractMessageContent(body string) string {
|
|||||||
if content == "" {
|
if content == "" {
|
||||||
logger.Debug("No content extracted, falling back to simple extraction")
|
logger.Debug("No content extracted, falling back to simple extraction")
|
||||||
logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
|
logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
|
||||||
return fallbackExtractContent(body)
|
// When primary extraction yields no content, the body is raw, so header stripping is needed.
|
||||||
|
return cleanMessageContent(fallbackExtractContent(body), true)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean up the content
|
// Clean up the content
|
||||||
logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
|
logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
|
||||||
content = cleanMessageContent(content)
|
// When ReadMessage succeeds, 'content' is already the message body (or part body),
|
||||||
|
// so no further header stripping should be done.
|
||||||
|
content = cleanMessageContent(content, false)
|
||||||
logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")
|
logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")
|
||||||
|
|
||||||
logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
|
logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
|
||||||
@@ -426,8 +431,11 @@ func handleSinglePartMessage(reader io.Reader) string {
|
|||||||
return content
|
return content
|
||||||
}
|
}
|
||||||
|
|
||||||
func cleanMessageContent(content string) string {
|
func cleanMessageContent(content string, performHeaderStripping bool) string {
|
||||||
logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
|
logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
|
||||||
|
logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag")
|
||||||
|
|
||||||
|
if performHeaderStripping {
|
||||||
// Remove any remaining email headers that might be in the body
|
// Remove any remaining email headers that might be in the body
|
||||||
lines := strings.Split(content, "\n")
|
lines := strings.Split(content, "\n")
|
||||||
var cleanLines []string
|
var cleanLines []string
|
||||||
@@ -448,16 +456,21 @@ func cleanMessageContent(content string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Add non-header lines
|
// Add non-header lines
|
||||||
if !headerSection {
|
// This condition was originally !headerSection, but if we are past the headers,
|
||||||
|
// we should always add the line. If headerSection is still true here, it means
|
||||||
|
// it's the first line of content after potential headers were skipped.
|
||||||
cleanLines = append(cleanLines, line)
|
cleanLines = append(cleanLines, line)
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
content = strings.Join(cleanLines, "\n")
|
content = strings.Join(cleanLines, "\n")
|
||||||
logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
|
logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
|
||||||
|
} else {
|
||||||
|
logger.Debug("cleanMessageContent: Skipping header stripping")
|
||||||
|
}
|
||||||
|
|
||||||
// Convert newlines to HTML breaks for display
|
// Convert newlines to HTML breaks for display
|
||||||
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
|
// First, normalize all newlines (\r\n or \n) to just \n
|
||||||
|
content = strings.ReplaceAll(content, "\r\n", "\n")
|
||||||
|
// Then, replace each \n with <br>\n
|
||||||
content = strings.ReplaceAll(content, "\n", "<br>\n")
|
content = strings.ReplaceAll(content, "\n", "<br>\n")
|
||||||
logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")
|
logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")
|
||||||
|
|
||||||
@@ -474,26 +487,25 @@ func cleanMessageContent(content string) string {
|
|||||||
func fallbackExtractContent(body string) string {
|
func fallbackExtractContent(body string) string {
|
||||||
logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
|
logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
|
||||||
logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
|
logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
|
||||||
|
var content string
|
||||||
parts := strings.Split(body, "\r\n\r\n")
|
parts := strings.Split(body, "\r\n\r\n")
|
||||||
if len(parts) > 1 {
|
if len(parts) > 1 {
|
||||||
content := strings.Join(parts[1:], "\r\n\r\n")
|
content = strings.Join(parts[1:], "\r\n\r\n")
|
||||||
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
|
|
||||||
content = strings.ReplaceAll(content, "\n", "<br>\n")
|
|
||||||
logger.WithFields(logrus.Fields{
|
logger.WithFields(logrus.Fields{
|
||||||
"contentLength": len(content),
|
"contentLength": len(content),
|
||||||
"partsCount": len(parts),
|
"partsCount": len(parts),
|
||||||
}).Debug("Successfully extracted content using fallback method")
|
}).Debug("Successfully extracted content using fallback method (from parts)")
|
||||||
logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
|
logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
|
||||||
return content
|
} else {
|
||||||
}
|
content = body
|
||||||
content := body
|
|
||||||
content = strings.ReplaceAll(content, "\r\n", "<br>\n")
|
|
||||||
content = strings.ReplaceAll(content, "\n", "<br>\n")
|
|
||||||
logger.WithFields(logrus.Fields{
|
logger.WithFields(logrus.Fields{
|
||||||
"contentLength": len(content),
|
"contentLength": len(content),
|
||||||
"fullBody": true,
|
"fullBody": true,
|
||||||
}).Debug("Using full body as content in fallback method")
|
}).Debug("Using full body as content in fallback method")
|
||||||
logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body")
|
logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)")
|
||||||
|
}
|
||||||
|
// Newline conversion and signature stripping will be handled by cleanMessageContent,
|
||||||
|
// which is now called by the callers of fallbackExtractContent.
|
||||||
return content
|
return content
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user