feat: Improve email content extraction by enhancing header stripping and cleaning logic

feat: Enhance AI response generation and improve email content extraction with detailed logging
2025-05-29 20:16:12 +02:00 · 2025-05-29 19:47:39 +02:00
3 changed files with 125 additions and 46 deletions
--- a/internal/ai/ai.go
+++ b/internal/ai/ai.go
@@ -89,11 +89,12 @@ func (a *AI) GenerateReply(emailContent string, contextContent map[string]string

 	// Prepare the system message with language-specific instruction
 	systemMsg := fmt.Sprintf(`You are a helpful assistant who responds to emails.
-Your primary goal is to answer the user's query based on the provided email and context.
+Your primary goal is to answer the user's query (found in the 'Email Body') by primarily using the information available in the 'Additional Context' and your general knowledge.
+While the 'Email Body' provides the question, your answer should be synthesized from the context and your understanding, not by directly repeating or solely relying on unverified information from the 'Email Body' itself.

 Instructions:
 - Language: Your response must be entirely in %s, regardless of the language used in the email content or context.
-- Format: Your reply must be raw HTML. Use appropriate HTML tags for structure and styling.
+- Format: CRITICAL: Your reply MUST be raw HTML. Use appropriate HTML tags for structure and styling. For example, wrap paragraphs in <p>...</p> tags and use <br> for line breaks if needed within a paragraph. Even a short sentence must be wrapped in HTML (e.g., <p>Yes.</p>).
 - Markdown: Do NOT wrap the HTML in markdown code blocks (e.g., %s).
 - Extraneous Text: Do not include a subject line. Do not include explanations, commentary, or any extra text that is not part of the direct answer.
 - Closing: Avoid generic closing statements like "If you have further questions...". Focus solely on answering the email.
@@ -108,7 +109,13 @@ Instructions:
 		{Role: "user", Content: userMsg},
 	}

-	return a.makeAPIRequest(messages)
+	aiResponse, err := a.makeAPIRequest(messages)
+	if err != nil {
+		// Error already logged by makeAPIRequest, just propagate
+		return "", err
+	}
+	logger.WithField("rawAIResponse", aiResponse).Debug("Received raw response from AI")
+	return aiResponse, nil
 }

 func (a *AI) makeAPIRequest(messages []Message) (string, error) {
--- a/internal/imap/imap.go
+++ b/internal/imap/imap.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"mime"
 	"mime/multipart"
+	"mime/quotedprintable"
 	"net/mail"
 	"paraclub-ai-mailer/config"
 	"paraclub-ai-mailer/internal/logger"
@@ -73,6 +74,7 @@ type Email struct {
 	ID      string
 	Subject string
 	From    string
+	Date    time.Time
 	Body    string
 }

@@ -194,6 +196,7 @@ func (ic *IMAPClient) FetchUnprocessedEmails() ([]Email, error) {
 			ID:      msg.Envelope.MessageId,
 			Subject: msg.Envelope.Subject,
 			From:    from,
+			Date:    msg.Envelope.Date,
 			Body:    bodyBuilder.String(),
 		})
 	}
@@ -259,7 +262,7 @@ func (ic *IMAPClient) SaveDraft(email Email, response string) (err error) {
 		response,
 		email.From,
 		email.Subject,
-		time.Now().Format("Mon, 02 Jan 2006 15:04:05 -0700"),
+		email.Date.Format("Mon, 02 Jan 2006 15:04:05 -0700"),
 		extractMessageContent(email.Body))

 	literal := &MessageLiteral{
@@ -285,17 +288,20 @@ func (ic *IMAPClient) SaveDraft(email Email, response string) (err error) {
 // by removing email headers and MIME boundaries
 func extractMessageContent(body string) string {
 	logger.WithField("bodyLength", len(body)).Debug("Starting message content extraction")
+	logger.WithField("rawInputBody", body).Debug("extractMessageContent: Raw input body")
 	msg, err := mail.ReadMessage(strings.NewReader(body))
 	if err != nil {
 		logger.WithFields(logrus.Fields{
 			"error":      err,
 			"bodyLength": len(body),
 		}).Debug("Failed to parse email message, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ReadMessage fails, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}

 	contentTypeHeader := msg.Header.Get("Content-Type")
 	logger.WithField("contentTypeHeader", contentTypeHeader).Debug("Got Content-Type header")
+	logger.WithField("parsedContentTypeHeader", contentTypeHeader).Debug("extractMessageContent: Parsed Content-Type header")

 	mediaType, params, err := mime.ParseMediaType(contentTypeHeader)
 	if err != nil {
@@ -303,34 +309,61 @@ func extractMessageContent(body string) string {
 			"error":             err,
 			"contentTypeHeader": contentTypeHeader,
 		}).Debug("Failed to parse Content-Type header, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		// When ParseMediaType fails, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}

 	logger.WithFields(logrus.Fields{
 		"mediaType": mediaType,
 		"params":    params,
 	}).Debug("Parsed message Content-Type")
+	logger.WithFields(logrus.Fields{
+		"mediaType": mediaType,
+		"params":    params,
+	}).Debug("extractMessageContent: Parsed mediaType and params")

 	var content string
 	if strings.HasPrefix(mediaType, "multipart/") {
+		// For multipart, the handling of Content-Transfer-Encoding will be done within handleMultipartMessage for each part
+		logger.Debug("extractMessageContent: Handling as multipart message")
 		content = handleMultipartMessage(msg.Body, params["boundary"])
+		logger.WithField("contentFromMultipart", content).Debug("extractMessageContent: Content after handleMultipartMessage")
 	} else {
-		content = handleSinglePartMessage(msg.Body)
+		// For single part, handle Content-Transfer-Encoding here
+		var partReader io.Reader = msg.Body
+		transferEncoding := strings.ToLower(msg.Header.Get("Content-Transfer-Encoding"))
+		if transferEncoding == "quoted-printable" {
+			partReader = quotedprintable.NewReader(msg.Body)
+		}
+		// Add handling for "base64" if needed in the future
+		// else if transferEncoding == "base64" {
+		// 	partReader = base64.NewDecoder(base64.StdEncoding, msg.Body)
+		// }
+		logger.Debug("extractMessageContent: Handling as single part message")
+		content = handleSinglePartMessage(partReader)
+		logger.WithField("contentFromSinglePart", content).Debug("extractMessageContent: Content after handleSinglePartMessage")
 	}

 	if content == "" {
 		logger.Debug("No content extracted, falling back to simple extraction")
-		return fallbackExtractContent(body)
+		logger.Debug("extractMessageContent: No content from primary extraction, falling back.")
+		// When primary extraction yields no content, the body is raw, so header stripping is needed.
+		return cleanMessageContent(fallbackExtractContent(body), true)
 	}

 	// Clean up the content
-	content = cleanMessageContent(content)
+	logger.WithField("contentBeforeClean", content).Debug("extractMessageContent: Content before cleanMessageContent")
+	// When ReadMessage succeeds, 'content' is already the message body (or part body),
+	// so no further header stripping should be done.
+	content = cleanMessageContent(content, false)
+	logger.WithField("contentAfterClean", content).Debug("extractMessageContent: Content after cleanMessageContent")

 	logger.WithField("contentLength", len(content)).Debug("Successfully extracted and cleaned message content")
 	return content
 }

 func handleMultipartMessage(reader io.Reader, boundary string) string {
+	logger.WithField("boundary", boundary).Debug("handleMultipartMessage: Starting with boundary")
 	if boundary == "" {
 		logger.Debug("No boundary found in multipart message")
 		return ""
@@ -351,89 +384,128 @@ func handleMultipartMessage(reader io.Reader, boundary string) string {
 		}

 		contentType := part.Header.Get("Content-Type")
+		contentTransferEncoding := strings.ToLower(part.Header.Get("Content-Transfer-Encoding"))
+		logger.WithFields(logrus.Fields{
+			"partIndex":            partIndex,
+			"partContentType":      contentType,
+			"partTransferEncoding": contentTransferEncoding,
+			"partHeaders":          part.Header,
+		}).Debug("handleMultipartMessage: Processing part")
+
 		if strings.HasPrefix(contentType, "text/plain") {
+			var partReader io.Reader = part
+			if contentTransferEncoding == "quoted-printable" {
+				partReader = quotedprintable.NewReader(part)
+			}
+			// Add handling for "base64" if needed in the future
+			// else if contentTransferEncoding == "base64" {
+			// 	partReader = base64.NewDecoder(base64.StdEncoding, part)
+			// }
+
 			buf := new(bytes.Buffer)
-			if _, err := buf.ReadFrom(part); err != nil {
-				continue
+			if _, err := buf.ReadFrom(partReader); err != nil {
+				logger.WithError(err).WithField("partIndex", partIndex).Debug("Failed to read from partReader in multipart")
+				continue // Or handle error more robustly
 			}
 			textContent = buf.String()
+			// Assuming we only care about the first text/plain part found
+			// If multiple text/plain parts could exist and need concatenation, this logic would need adjustment.
 			break
 		}
 		partIndex++
 	}

+	logger.WithField("textContentResult", textContent).Debug("handleMultipartMessage: Returning textContent")
 	return textContent
 }

 func handleSinglePartMessage(reader io.Reader) string {
+	logger.Debug("handleSinglePartMessage: Starting")
 	buf := new(bytes.Buffer)
 	if _, err := buf.ReadFrom(reader); err != nil {
 		logger.WithError(err).Debug("Failed to read message body")
 		return ""
 	}
-	return buf.String()
+	content := buf.String()
+	logger.WithField("readContent", content).Debug("handleSinglePartMessage: Content read from reader")
+	return content
 }

-func cleanMessageContent(content string) string {
-	// Remove any remaining email headers that might be in the body
-	lines := strings.Split(content, "\n")
-	var cleanLines []string
-	headerSection := true
+func cleanMessageContent(content string, performHeaderStripping bool) string {
+	logger.WithField("inputContentLength", len(content)).Debug("cleanMessageContent: Starting")
+	logger.WithField("performHeaderStripping", performHeaderStripping).Debug("cleanMessageContent: performHeaderStripping flag")

-	for _, line := range lines {
-		trimmed := strings.TrimSpace(line)
+	if performHeaderStripping {
+		// Remove any remaining email headers that might be in the body
+		lines := strings.Split(content, "\n")
+		var cleanLines []string
+		headerSection := true

-		// Empty line marks the end of headers
-		if headerSection && trimmed == "" {
-			headerSection = false
-			continue
-		}
+		for _, line := range lines {
+			trimmed := strings.TrimSpace(line)

-		// Skip header lines
-		if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
-			continue
-		}
+			// Empty line marks the end of headers
+			if headerSection && trimmed == "" {
+				headerSection = false
+				continue
+			}

-		// Add non-header lines
-		if !headerSection {
+			// Skip header lines
+			if headerSection && (strings.Contains(trimmed, ":") || trimmed == "") {
+				continue
+			}
+
+			// Add non-header lines
+			// This condition was originally !headerSection, but if we are past the headers,
+			// we should always add the line. If headerSection is still true here, it means
+			// it's the first line of content after potential headers were skipped.
 			cleanLines = append(cleanLines, line)
 		}
+		content = strings.Join(cleanLines, "\n")
+		logger.WithField("contentAfterHeaderStripLength", len(content)).Debug("cleanMessageContent: Content after header stripping")
+	} else {
+		logger.Debug("cleanMessageContent: Skipping header stripping")
 	}

-	content = strings.Join(cleanLines, "\n")
-
 	// Convert newlines to HTML breaks for display
-	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
+	// First, normalize all newlines (\r\n or \n) to just \n
+	content = strings.ReplaceAll(content, "\r\n", "\n")
+	// Then, replace each \n with <br>\n
 	content = strings.ReplaceAll(content, "\n", "<br>\n")
+	logger.WithField("contentAfterNewlineConversionLength", len(content)).Debug("cleanMessageContent: Content after newline conversion")

 	// Remove any remaining email signature markers
 	content = strings.Split(content, "\n-- ")[0]
 	content = strings.Split(content, "<br>-- ")[0]

-	return strings.TrimSpace(content)
+	finalContent := strings.TrimSpace(content)
+	logger.WithField("finalCleanedContentLength", len(finalContent)).Debug("cleanMessageContent: Returning final cleaned content")
+	return finalContent
 }

 // fallbackExtractContent is the previous implementation used as fallback
 func fallbackExtractContent(body string) string {
 	logger.WithField("bodyLength", len(body)).Debug("Using fallback content extraction method")
+	logger.WithField("rawInputBodyFallbackLength", len(body)).Debug("fallbackExtractContent: Raw input body")
+	var content string
 	parts := strings.Split(body, "\r\n\r\n")
 	if len(parts) > 1 {
-		content := strings.Join(parts[1:], "\r\n\r\n")
-		content = strings.ReplaceAll(content, "\r\n", "<br>\n")
-		content = strings.ReplaceAll(content, "\n", "<br>\n")
+		content = strings.Join(parts[1:], "\r\n\r\n")
 		logger.WithFields(logrus.Fields{
 			"contentLength": len(content),
 			"partsCount":    len(parts),
-		}).Debug("Successfully extracted content using fallback method")
-		return content
+		}).Debug("Successfully extracted content using fallback method (from parts)")
+		logger.WithField("extractedContentFallbackLength", len(content)).Debug("fallbackExtractContent: Content from splitting parts")
+	} else {
+		content = body
+		logger.WithFields(logrus.Fields{
+			"contentLength": len(content),
+			"fullBody":      true,
+		}).Debug("Using full body as content in fallback method")
+		logger.WithField("finalContentFallbackLength", len(content)).Debug("fallbackExtractContent: Final content from full body (no parts split)")
 	}
-	content := body
-	content = strings.ReplaceAll(content, "\r\n", "<br>\n")
-	content = strings.ReplaceAll(content, "\n", "<br>\n")
-	logger.WithFields(logrus.Fields{
-		"contentLength": len(content),
-		"fullBody":      true,
-	}).Debug("Using full body as content in fallback method")
+	// Newline conversion and signature stripping will be handled by cleanMessageContent,
+	// which is now called by the callers of fallbackExtractContent.
 	return content
 }

--- a/BIN
+++ b/BIN
Author	SHA1	Message	Date
Dominik Polakovics	e248e26b2d	feat: Improve email content extraction by enhancing header stripping and cleaning logic	2025-05-29 20:16:12 +02:00
Dominik Polakovics	1756f3462b	feat: Enhance AI response generation and improve email content extraction with detailed logging	2025-05-29 19:47:39 +02:00