看了网上很多个spider版本, 几乎都是使用regexp包正则匹配实现.
其实使用DOM(先把HTML解析成节点树, 再遍历节点), 性能更好, 也更优雅
package main import ( "fmt" "net/http" "os" "golang.org/x/net/html" ) func visit(links []string, n *html.Node) []string { if n.Type == html.ElementNode && n.Data == 'a' { for _, a := range n.Attr { if a.Key == "href" { links = append(links, a.Val) } } } for c := n.FirstChild; c != nil; c = c.NextSibling { links = visit(links, c) } return links } func main() { for _, url := range os.Args[1:] { links, err := findLinks(url) if err != nil { fmt.Fprintf(os.Stderr, "findlinks2: %v\n", err) continue } for _, link := range links { fmt.Println(link) } } } func findLinks(url string) ([]string, error) { resp, err := http.Get(url) if err != nil { return nil, err } if resp.StatusCode != http.StatusOK { resp.Body.Close() return nil, fmt.Errorf("getting %s: %s", url, resp.Status) } doc, err := html.Parse(resp.Body) resp.Body.Close() if err != nil { return nil, fmt.Errorf("pax resing %s as HTML: %v", url, err) } return visit(nil, doc), nil }
有疑问加站长微信联系(非本文作者)