improvements

This commit is contained in:
2025-03-01 05:11:04 +01:00
parent 8b7e1e59d5
commit f349057975
5 changed files with 73 additions and 16 deletions

View File

@@ -3,7 +3,10 @@ package fetcher
import (
"io"
"net/http"
"strings"
"time"
"golang.org/x/net/html"
)
type Fetcher struct {
@@ -18,6 +21,31 @@ func New() *Fetcher {
}
}
// extractText parses htmlContent as an HTML document and returns the text of
// its text nodes in document order. Each node's text is trimmed of
// surrounding whitespace, empty results are dropped, and the remaining pieces
// are joined by single spaces.
//
// Text inside <script> and <style> elements is skipped: the parser exposes
// their contents as ordinary text nodes, but they hold code rather than
// readable content.
//
// If the document cannot be parsed, htmlContent is returned unchanged.
func (f *Fetcher) extractText(htmlContent string) string {
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return htmlContent // fall back to the raw content if parsing fails
	}

	var result strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		switch n.Type {
		case html.ElementNode:
			// Tag names are lowercased by the parser, so a plain comparison
			// is enough. Returning here skips the whole subtree.
			if n.Data == "script" || n.Data == "style" {
				return
			}
		case html.TextNode:
			if text := strings.TrimSpace(n.Data); text != "" {
				// Write the separator before every piece except the first so
				// the result never carries a trailing space.
				if result.Len() > 0 {
					result.WriteByte(' ')
				}
				result.WriteString(text)
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	return result.String()
}
func (f *Fetcher) FetchContent(url string) (string, error) {
resp, err := f.client.Get(url)
if err != nil {
@@ -30,12 +58,11 @@ func (f *Fetcher) FetchContent(url string) (string, error) {
return "", err
}
return string(body), nil
return f.extractText(string(body)), nil
}
func (f *Fetcher) FetchAllURLs(urls []string) (map[string]string, error) {
results := make(map[string]string)
for _, url := range urls {
content, err := f.FetchContent(url)
if err != nil {
@@ -43,6 +70,5 @@ func (f *Fetcher) FetchAllURLs(urls []string) (map[string]string, error) {
}
results[url] = content
}
return results, nil
}