improvements
This commit is contained in:
@@ -3,7 +3,10 @@ package fetcher
|
||||
import (
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
type Fetcher struct {
|
||||
@@ -18,6 +21,31 @@ func New() *Fetcher {
|
||||
}
|
||||
}
|
||||
|
||||
func (f *Fetcher) extractText(htmlContent string) string {
|
||||
doc, err := html.Parse(strings.NewReader(htmlContent))
|
||||
if err != nil {
|
||||
return htmlContent // fallback to raw content if parsing fails
|
||||
}
|
||||
|
||||
var result strings.Builder
|
||||
var extractTextNode func(*html.Node)
|
||||
extractTextNode = func(n *html.Node) {
|
||||
if n.Type == html.TextNode {
|
||||
text := strings.TrimSpace(n.Data)
|
||||
if text != "" {
|
||||
result.WriteString(text)
|
||||
result.WriteString(" ")
|
||||
}
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
extractTextNode(c)
|
||||
}
|
||||
}
|
||||
|
||||
extractTextNode(doc)
|
||||
return strings.TrimSpace(result.String())
|
||||
}
|
||||
|
||||
func (f *Fetcher) FetchContent(url string) (string, error) {
|
||||
resp, err := f.client.Get(url)
|
||||
if err != nil {
|
||||
@@ -30,12 +58,11 @@ func (f *Fetcher) FetchContent(url string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return string(body), nil
|
||||
return f.extractText(string(body)), nil
|
||||
}
|
||||
|
||||
func (f *Fetcher) FetchAllURLs(urls []string) (map[string]string, error) {
|
||||
results := make(map[string]string)
|
||||
|
||||
for _, url := range urls {
|
||||
content, err := f.FetchContent(url)
|
||||
if err != nil {
|
||||
@@ -43,6 +70,5 @@ func (f *Fetcher) FetchAllURLs(urls []string) (map[string]string, error) {
|
||||
}
|
||||
results[url] = content
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user