package fetcher
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"paraclub-ai-mailer/internal/logger"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// Fetcher downloads web pages over HTTP and extracts their readable text.
type Fetcher struct {
	// client is the HTTP client used for all requests; New configures it
	// with a request timeout so a hung server cannot block indefinitely.
	client *http.Client
}
|
|
|
|
func New() *Fetcher {
|
|
return &Fetcher{
|
|
client: &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (f *Fetcher) extractText(htmlContent string) string {
|
|
logger.WithField("contentLength", len(htmlContent)).Debug("Starting HTML text extraction")
|
|
|
|
doc, err := html.Parse(strings.NewReader(htmlContent))
|
|
if err != nil {
|
|
logger.WithError(err).Debug("Failed to parse HTML, falling back to raw content")
|
|
return htmlContent
|
|
}
|
|
|
|
var result strings.Builder
|
|
var textNodeCount int
|
|
|
|
var extractTextNode func(*html.Node)
|
|
extractTextNode = func(n *html.Node) {
|
|
if n.Type == html.TextNode {
|
|
text := strings.TrimSpace(n.Data)
|
|
if text != "" {
|
|
result.WriteString(text)
|
|
result.WriteString(" ")
|
|
textNodeCount++
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
extractTextNode(c)
|
|
}
|
|
}
|
|
|
|
extractTextNode(doc)
|
|
extracted := strings.TrimSpace(result.String())
|
|
|
|
logger.WithFields(logrus.Fields{
|
|
"textNodeCount": textNodeCount,
|
|
"extractedLength": len(extracted),
|
|
}).Debug("Completed HTML text extraction")
|
|
|
|
return extracted
|
|
}
|
|
|
|
func (f *Fetcher) FetchContent(url string) (string, error) {
|
|
logger.WithField("url", url).Debug("Starting content fetch")
|
|
|
|
resp, err := f.client.Get(url)
|
|
if err != nil {
|
|
logger.WithFields(logrus.Fields{
|
|
"url": url,
|
|
"error": err,
|
|
}).Error("Failed to fetch URL")
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
logger.WithFields(logrus.Fields{
|
|
"url": url,
|
|
"statusCode": resp.StatusCode,
|
|
"contentType": resp.Header.Get("Content-Type"),
|
|
}).Debug("Received HTTP response")
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
logger.WithFields(logrus.Fields{
|
|
"url": url,
|
|
"error": err,
|
|
}).Error("Failed to read response body")
|
|
return "", err
|
|
}
|
|
|
|
logger.WithFields(logrus.Fields{
|
|
"url": url,
|
|
"bodyLength": len(body),
|
|
}).Debug("Successfully read response body")
|
|
|
|
content := f.extractText(string(body))
|
|
|
|
logger.WithFields(logrus.Fields{
|
|
"url": url,
|
|
"extractedLength": len(content),
|
|
}).Debug("Completed content fetch and extraction")
|
|
|
|
return content, nil
|
|
}
|
|
|
|
func (f *Fetcher) FetchAllURLs(urls []string) (map[string]string, error) {
|
|
logger.WithField("urlCount", len(urls)).Debug("Starting batch URL fetch")
|
|
|
|
results := make(map[string]string)
|
|
var failedUrls []string
|
|
|
|
for _, url := range urls {
|
|
content, err := f.FetchContent(url)
|
|
if err != nil {
|
|
logger.WithFields(logrus.Fields{
|
|
"url": url,
|
|
"error": err,
|
|
}).Error("Failed to fetch URL in batch")
|
|
failedUrls = append(failedUrls, url)
|
|
continue
|
|
}
|
|
results[url] = content
|
|
}
|
|
|
|
if len(failedUrls) > 0 {
|
|
err := fmt.Errorf("failed to fetch %d URLs: %v", len(failedUrls), failedUrls)
|
|
logger.WithError(err).Error("Batch URL fetch completed with errors")
|
|
return nil, err
|
|
}
|
|
|
|
logger.WithField("successCount", len(results)).Debug("Successfully completed batch URL fetch")
|
|
return results, nil
|
|
}
|