// Package fetcher downloads web pages over HTTP and extracts their
// visible text content for downstream processing.
package fetcher

import (
	"fmt"
	"io"
	"net/http"
	"paraclub-ai-mailer/internal/logger"
	"strings"
	"time"

	"github.com/sirupsen/logrus"
	"golang.org/x/net/html"
)

// Fetcher wraps an http.Client configured with a request timeout.
type Fetcher struct {
	client *http.Client
}

// New returns a Fetcher whose HTTP client times out after 30 seconds.
func New() *Fetcher {
	return &Fetcher{
		client: &http.Client{
			Timeout: 30 * time.Second,
		},
	}
}

// extractText parses htmlContent and concatenates the text of every text
// node, separated by single spaces. Script and style elements are skipped,
// since their contents are code rather than visible text. If parsing fails,
// the raw input is returned unchanged.
func (f *Fetcher) extractText(htmlContent string) string {
	logger.WithField("contentLength", len(htmlContent)).Debug("Starting HTML text extraction")

	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		logger.WithError(err).Debug("Failed to parse HTML, falling back to raw content")
		return htmlContent
	}

	var result strings.Builder
	var textNodeCount int
	var extractTextNode func(*html.Node)
	extractTextNode = func(n *html.Node) {
		// Skip <script> and <style> subtrees entirely so their text nodes
		// do not leak into the extracted content.
		if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") {
			return
		}
		if n.Type == html.TextNode {
			text := strings.TrimSpace(n.Data)
			if text != "" {
				result.WriteString(text)
				result.WriteString(" ")
				textNodeCount++
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			extractTextNode(c)
		}
	}
	extractTextNode(doc)

	extracted := strings.TrimSpace(result.String())
	logger.WithFields(logrus.Fields{
		"textNodeCount":   textNodeCount,
		"extractedLength": len(extracted),
	}).Debug("Completed HTML text extraction")
	return extracted
}

// FetchContent downloads the page at url and returns its extracted text.
// Non-2xx responses are treated as errors so that error pages are never
// mistaken for content.
func (f *Fetcher) FetchContent(url string) (string, error) {
	logger.WithField("url", url).Debug("Starting content fetch")

	resp, err := f.client.Get(url)
	if err != nil {
		logger.WithFields(logrus.Fields{
			"url":   url,
			"error": err,
		}).Error("Failed to fetch URL")
		return "", err
	}
	defer resp.Body.Close()

	logger.WithFields(logrus.Fields{
		"url":         url,
		"statusCode":  resp.StatusCode,
		"contentType": resp.Header.Get("Content-Type"),
	}).Debug("Received HTTP response")

	// Reject non-success status codes before reading the body.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		err := fmt.Errorf("unexpected status %d fetching %s", resp.StatusCode, url)
		logger.WithError(err).Error("Non-success HTTP status")
		return "", err
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		logger.WithFields(logrus.Fields{
			"url":   url,
			"error": err,
		}).Error("Failed to read response body")
		return "", err
	}

	logger.WithFields(logrus.Fields{
		"url":        url,
		"bodyLength": len(body),
	}).Debug("Successfully read response body")

	content := f.extractText(string(body))
	logger.WithFields(logrus.Fields{
		"url":             url,
		"extractedLength": len(content),
	}).Debug("Completed content fetch and extraction")
	return content, nil
}

// FetchAllURLs fetches every URL sequentially and returns a map from URL to
// extracted text. If any fetch fails, the whole batch fails and the returned
// error lists the URLs that could not be fetched.
func (f *Fetcher) FetchAllURLs(urls []string) (map[string]string, error) {
	logger.WithField("urlCount", len(urls)).Debug("Starting batch URL fetch")

	results := make(map[string]string)
	var failedUrls []string

	for _, url := range urls {
		content, err := f.FetchContent(url)
		if err != nil {
			logger.WithFields(logrus.Fields{
				"url":   url,
				"error": err,
			}).Error("Failed to fetch URL in batch")
			failedUrls = append(failedUrls, url)
			continue
		}
		results[url] = content
	}

	if len(failedUrls) > 0 {
		err := fmt.Errorf("failed to fetch %d URLs: %v", len(failedUrls), failedUrls)
		logger.WithError(err).Error("Batch URL fetch completed with errors")
		return nil, err
	}

	logger.WithField("successCount", len(results)).Debug("Successfully completed batch URL fetch")
	return results, nil
}
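// Usage sketch (not part of this package): a minimal caller, assuming the
// package is imported as "paraclub-ai-mailer/internal/fetcher" and using
// placeholder URLs. Note that FetchAllURLs fails the whole batch when any
// single URL fails, so callers that want partial results should call
// FetchContent per URL instead.
//
//	package main
//
//	import (
//		"fmt"
//		"log"
//
//		"paraclub-ai-mailer/internal/fetcher"
//	)
//
//	func main() {
//		f := fetcher.New()
//		contents, err := f.FetchAllURLs([]string{
//			"https://example.com/a",
//			"https://example.com/b",
//		})
//		if err != nil {
//			log.Fatal(err) // at least one URL failed; no partial results are returned
//		}
//		for url, text := range contents {
//			fmt.Printf("%s: %d bytes of extracted text\n", url, len(text))
//		}
//	}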