```
package main
import (
"fmt"
"github.com/jackdanger/collectlinks"
"log"
"net/http"
"net/url"
"sync"
"time"
)
// UrlQueue holds the crawler's shared bookkeeping: a channel carrying URLs
// and a map recording the state of every URL seen so far.
//
// In Visited a missing key means the URL was never recorded, false means it
// was recorded but is not marked as executed, and true means it is marked as
// executed. Lock guards Visited and must be non-nil before any method is
// called; Visited must be a non-nil map before Push or Set is called.
type UrlQueue struct {
	QueueCh chan string
	Visited map[string]bool
	Lock    *sync.RWMutex
}

// IsExist looks up uri in Visited. The first result is the stored flag and
// the second reports whether uri is present at all.
//
// The lookup never modifies the map, so it takes the shared read lock; this
// lets concurrent lookups proceed in parallel while still excluding writers.
func (uq *UrlQueue) IsExist(uri string) (bool, bool) {
	uq.Lock.RLock()
	defer uq.Lock.RUnlock()
	val, ok := uq.Visited[uri]
	return val, ok
}

// Push records uri in Visited with the flag set to false, overwriting any
// previous flag stored for it.
func (uq *UrlQueue) Push(uri string) {
	uq.Lock.Lock()
	defer uq.Lock.Unlock()
	uq.Visited[uri] = false
}

// Set stores isExecute as the flag for uri. Passing true marks the URL as
// executed; passing false clears that mark.
func (uq *UrlQueue) Set(uri string, isExecute bool) {
	uq.Lock.Lock()
	defer uq.Lock.Unlock()
	uq.Visited[uri] = isExecute
}
// Package-level crawler state; urlQ and urlCh are populated in init.
var (
	// urlQ is the shared URL bookkeeping used by every worker.
	urlQ UrlQueue
	// urlCh carries URLs from the dispatcher loop in main to the workers.
	urlCh chan string
	// num is the number of workers started by main and the capacity of urlCh.
	num = 5
)
// init builds the shared queue (unbuffered channel, empty visited map, fresh
// lock) and the buffered worker channel sized by num.
func init() {
	urlQ = UrlQueue{
		QueueCh: make(chan string),
		Visited: make(map[string]bool),
		Lock:    new(sync.RWMutex),
	}
	urlCh = make(chan string, num)
}
// Consumer is a crawl worker. It downloads every URL it receives from urlCh,
// reusing a single HTTP client, and calls wg.Done when it returns.
//
// The worker blocks on the channel rather than polling it, so an idle worker
// costs nothing and produces no log output. It returns once urlCh has been
// closed and drained; as long as urlCh stays open it keeps waiting for work.
func Consumer(wg *sync.WaitGroup) {
	defer wg.Done()

	// A timeout keeps one unresponsive server from tying up this worker
	// forever.
	client := &http.Client{Timeout: 30 * time.Second}

	for uri := range urlCh {
		download(client, uri)
	}
}
// main seeds the crawl with a start URL, launches num workers, and then
// forwards every URL arriving on urlQ.QueueCh to the workers through urlCh.
//
// NOTE(review): nothing visible here closes urlQ.QueueCh, so the forwarding
// loop does not end and wg.Wait is not reached — confirm whether that is
// intended.
func main() {
	seed := "http://www.baidu.com/"

	// QueueCh is only read by the loop below, so the first send has to
	// happen from another goroutine.
	go func() {
		urlQ.QueueCh <- seed
	}()

	var wg sync.WaitGroup
	wg.Add(num)
	for started := 0; started < num; started++ {
		go Consumer(&wg)
	}

	for {
		uri, open := <-urlQ.QueueCh
		if !open {
			break
		}
		urlCh <- uri
	}

	wg.Wait()
}
// download fetches url with client, collects the links found in the response
// body and queues each link that is either unknown or not marked as executed.
//
// url is marked as executed before the request is sent; if the request cannot
// be built or fails, the mark is cleared again so the URL can be queued anew
// when it is next encountered.
func download(client *http.Client, url string) {
	urlQ.Set(url, true)

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// A malformed URL yields a nil request; bail out instead of
		// dereferencing it below.
		log.Printf("http new request error:%+v", err)
		urlQ.Set(url, false)
		return
	}
	// Custom header.
	req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)")

	resp, err := client.Do(req)
	if err != nil {
		log.Printf("http get error:%+v", err)
		urlQ.Set(url, false)
		return
	}
	// Release the connection when the function returns.
	defer resp.Body.Close()

	for _, link := range collectlinks.All(resp.Body) {
		absolute := urlJoin(link, url)
		if absolute == " " {
			// urlJoin returns a single space when it cannot parse its input.
			continue
		}

		val, ok := urlQ.IsExist(absolute)
		log.Printf("val:%v ok:%v\n", val, ok)
		if ok && val {
			// Already marked as executed.
			continue
		}

		// Unknown, or known with a false flag: record it and queue it.
		urlQ.Push(absolute)
		log.Printf("parse url:%s\n", absolute)

		// Send from a separate goroutine so this worker is not held up
		// until the URL is received from QueueCh.
		go func(u string) {
			urlQ.QueueCh <- u
		}(absolute)
	}
}
// urlJoin resolves href against base and returns the resulting absolute URL
// as a string. If either argument cannot be parsed as a URL it returns a
// single space, which callers treat as "no usable link".
func urlJoin(href, base string) string {
	const invalid = " "

	root, err := url.Parse(base)
	if err != nil {
		return invalid
	}

	// (*url.URL).Parse parses href and resolves it relative to root.
	resolved, err := root.Parse(href)
	if err != nil {
		return invalid
	}
	return resolved.String()
}
```
部分代码引用参考:
https://zhuanlan.zhihu.com/p/55039990
有疑问加站长微信联系(非本文作者)