75 lines
1.4 KiB
Go
75 lines
1.4 KiB
Go
package fetcher
|
|
|
|
import (
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"golang.org/x/net/html"
)
|
|
|
|
// Fetcher downloads pages over HTTP and reduces them to plain text.
// Construct one with New; the zero value has a nil client.
type Fetcher struct {
	// client performs every outgoing HTTP request made by this Fetcher.
	client *http.Client
}
|
|
|
|
func New() *Fetcher {
|
|
return &Fetcher{
|
|
client: &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (f *Fetcher) extractText(htmlContent string) string {
|
|
doc, err := html.Parse(strings.NewReader(htmlContent))
|
|
if err != nil {
|
|
return htmlContent // fallback to raw content if parsing fails
|
|
}
|
|
|
|
var result strings.Builder
|
|
var extractTextNode func(*html.Node)
|
|
extractTextNode = func(n *html.Node) {
|
|
if n.Type == html.TextNode {
|
|
text := strings.TrimSpace(n.Data)
|
|
if text != "" {
|
|
result.WriteString(text)
|
|
result.WriteString(" ")
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
extractTextNode(c)
|
|
}
|
|
}
|
|
|
|
extractTextNode(doc)
|
|
return strings.TrimSpace(result.String())
|
|
}
|
|
|
|
func (f *Fetcher) FetchContent(url string) (string, error) {
|
|
resp, err := f.client.Get(url)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return f.extractText(string(body)), nil
|
|
}
|
|
|
|
func (f *Fetcher) FetchAllURLs(urls []string) (map[string]string, error) {
|
|
results := make(map[string]string)
|
|
for _, url := range urls {
|
|
content, err := f.FetchContent(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
results[url] = content
|
|
}
|
|
return results, nil
|
|
}
|