golang爬虫框架colly

写个代码容易么 · · 2270 次点击 · · 开始浏览

这是一个创建于的文章，其中的信息可能已经有所发展或是发生改变。

第一次，站长亲自招 Gopher 了>>>

colly.png

colly一款快速优雅的golang爬虫框架，简单易用，功能完备。

官网地址：http://go-colly.org/

包地址：import "github.com/gocolly/colly"

一个简单的例子：

package main

import (
    "fmt"
    "github.com/gocolly/colly"
)

func main() {
    c := colly.NewCollector()

    c.OnHTML("a", func(e *colly.HTMLElement) {
        e.Request.Visit(e.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    c.Visit("http://go-colly.org/")
}

使用方式概括下来主要有三步：

创建一个采集器
注册回调函数
访问具体网站

创建采集器时可以指定一些配置参数，如useragent，爬取深度及日志等

colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
 colly.MaxDepth(1), 
colly.Debugger(&debug.LogDebugger{}))

回调函数共有7中

名称	说明	参数1	参数2
OnRequest	请求前调用	*colly.Request
OnError	请求发生错误时调用	*colly.Response	error
OnResponseHeaders	收到响应头后调用	colly.Response
OnResponse	收到响应后调用	colly.Response
OnHTML	响应内容是HTML时调用	xpath表达式	func(e *colly.HTMLElement)
OnXML	响应内容是XML时调用	xpath表达式	func(e *colly.XMLElement)
OnScraped	在OnXML之后调用	func(r *colly.Response)

OnHTML回调可以注册多个，匹配不同的xpath表达式

1. 爬取简书首页文章列表

通过浏览器开发者工具查看jianshu.com结构如下

colly-jianshu-dom.png

文章列表为ul标签，中间每一项是li标签，li中包含content，content中包含title，abstract和meta标签

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
)

func main() {
    c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1), colly.Debugger(&debug.LogDebugger{}))
        //文章列表
    c.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
            //列表中每一项
            e.ForEach("li", func(i int, item *colly.HTMLElement) {
            //文章链接
            href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
            //文章标题
            title := item.ChildText("div[class='content'] > a[class='title']")
            //文章摘要
            summary := item.ChildText("div[class='content'] > p[class='abstract']")
            fmt.Println(title, href)
            fmt.Println(summary)
            fmt.Println()
        })
    })

    err := c.Visit("https://www.jianshu.com")
    if err != nil {
        fmt.Println(err.Error())
    }
}

2.爬取文章列表和详情

文章列表和1方式一样，文章详情通过创建新的采集器访问详情页面

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "time"
)

func main() {
    c1 := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1))
    c2 := c1.Clone()

    //异步
    c2.Async = true
    //限速
    c2.Limit(&colly.LimitRule{
        DomainRegexp: "",
        DomainGlob:   "*.jianshu.com/p/*",
        Delay:        10 * time.Second,
        RandomDelay:  0,
        Parallelism:  1,
    })
    //采集器1，获取文章列表
    c1.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
        e.ForEach("li", func(i int, item *colly.HTMLElement) {
            href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
            title := item.ChildText("div[class='content'] > a[class='title']")
            summary := item.ChildText("div[class='content'] > p[class='abstract']")
            ctx := colly.NewContext()
            ctx.Put("href", href)
            ctx.Put("title", title)
            ctx.Put("summary", summary)
            //通过Context上下文对象将采集器1采集到的数据传递到采集器2
            c2.Request("GET", "https://www.jianshu.com" + href, nil, ctx, nil)
        })
    })
    //采集器2，获取文章详情
    c2.OnHTML("article", func(e *colly.HTMLElement) {
        href := e.Request.Ctx.Get("href")
        title := e.Request.Ctx.Get("title")
        summary := e.Request.Ctx.Get("summary")
        detail := e.Text

        fmt.Println("----------" + title + "----------")
        fmt.Println(href)
        fmt.Println(summary)
        fmt.Println(detail)
        fmt.Println()
    })

    c2.OnRequest(func(r *colly.Request) {
        fmt.Println("c2爬取页面：", r.URL)
    })

    c1.OnRequest(func(r *colly.Request) {
        fmt.Println("c1爬取页面：", r.URL)
    })

    c1.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    err := c1.Visit("https://www.jianshu.com")
    if err != nil {
        fmt.Println(err.Error())
    }

    c2.Wait()
}

3. 爬取需要登录的网页

官网提供登录页处理的例子，但是大多数涉及验证码，不好处理，目前方式是手动登录，复制cookie写到爬虫请求头里

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
    "github.com/gocolly/colly/extensions"
    _ "github.com/gocolly/colly/extensions"
    "net/http"
)

func main() {
    url := "https://www.jianshu.com"

    c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1), colly.Debugger(&debug.LogDebugger{}))
    c.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
        e.ForEach("li", func(i int, item *colly.HTMLElement) {
            href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
            title := item.ChildText("div[class='content'] > a[class='title']")
            summary := item.ChildText("div[class='content'] > p[class='abstract']")
            fmt.Println(title, href)
            fmt.Println(summary)
            fmt.Println()
        })
    })

    //设置随机useragent
    extensions.RandomUserAgent(c)
    //设置登录cookie
    c.SetCookies(url, []*http.Cookie{
        &http.Cookie{
            Name:     "remember_user_token",
            Value:    "wNDUxOV0sIiQyYSQxMSRwdkhqWVhHYmxXaDJ6dEU3NzJwbmsuIiwiMTU",
            Path:     "/",
            Domain:   ".jianshu.com",
            Secure:   true,
            HttpOnly: true,
        },
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("爬取页面：", r.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })
    err := c.Visit(url)
    if err != nil {
        fmt.Println(err.Error())
    }
}

4. 内存任务队列

将需要爬取的连接放入队列中，设置队列并发数，可以并行爬取连接

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
    "github.com/gocolly/colly/queue"
)

func main() {
    c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(3), colly.Debugger(&debug.LogDebugger{}))

//创建内存队列，大小10000，goroutine数量 5
    q, _ := queue.New(5, &queue.InMemoryQueueStorage{MaxSize: 10000})

    c.OnHTML("a", func(element *colly.HTMLElement) {
        element.Request.Visit(element.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("爬取页面：", r.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })
    q.AddURL("https://www.jianshu.com")

    q.Run(c)
}

5. redis任务队列

设置redis存储后，队列中URL存储在redis中，访问页面的cookie及访问记录也会保存在redis中

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
    "github.com/gocolly/colly/queue"
    "github.com/gocolly/redisstorage"
)

func main() {
    c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(3), colly.Debugger(&debug.LogDebugger{}))

    storage := &redisstorage.Storage{
        Address:  "192.168.1.10:6379",
        Password: "123456",
        DB:       0,
        Prefix:   "colly",
        Client:   nil,
        Expires:  0,
    }

    c.SetStorage(storage)

    err := storage.Clear()
    if err != nil{
        panic(err)
    }

    defer storage.Client.Close()

    q, _ := queue.New(5, storage)

    c.OnHTML("a", func(element *colly.HTMLElement) {
        element.Request.Visit(element.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("爬取页面：", r.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })
    q.AddURL("https://www.jianshu.com")

    q.Run(c)
}

redis中数据

6.配置代理

package main

import (
    "bytes"
    "log"

    "github.com/gocolly/colly"
    "github.com/gocolly/colly/proxy"
)

func main() {
    c := colly.NewCollector()

    //配置两个代理
    rp, err := proxy.RoundRobinProxySwitcher("http://127.0.0.1:1080", "socks5://127.0.0.1:1338")
    if err != nil {
        log.Fatal(err)
    }
    c.SetProxyFunc(rp)

    c.OnResponse(func(r *colly.Response) {
        log.Printf("%s\n", bytes.Replace(r.Body, []byte("\n"), nil, -1))
    })

    c.Visit("https://httpbin.org/ip")
}

有疑问加站长微信联系（非本文作者）

本文来自：简书

感谢作者：写个代码容易么

查看原文：golang爬虫框架colly

入群交流（和以上内容无关）：加入Go大咖交流群，或添加微信：liuxiaoyan-s 备注：入群；或加QQ群：692541889

2270 次点击

加入收藏微博

收入我的专栏

上一篇：理解Golang的goroutine和channel

下一篇：Serverless 衔接 Kafka 上下游数据流转实战

github

采集器

框架

redis

0 回复

添加一条新回复（您需要登录后才能回复没有账号？）

请尽量让自己的回复能够对别人有帮助
支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
支持 @ 本站用户；支持表情（输入 : 提示），见 Emoji cheat sheet
图片支持拖拽、截图粘贴等方式上传

关注我

扫码关注领全套学习资料
加入 QQ 群：
- 192706294（已满）
- 731990104（已满）
- 798786647（已满）
- 729884609（已满）
- 977810755（已满）
- 815126783（已满）
- 812540095（已满）
- 1006366459（已满）
- 692541889
加入微信群：liuxiaoyan-s，备注入群
也欢迎加入知识星球 Go粉丝们（免费）

golang爬虫框架colly

1. 爬取简书首页文章列表

2.爬取文章列表和详情

3. 爬取需要登录的网页

4. 内存任务队列

5. redis任务队列

6.配置代理

用户登录

今日阅读排行

一周阅读排行

关注我

1. 爬取简书首页文章列表

2.爬取文章列表和详情

3. 爬取需要登录的网页

4. 内存任务队列

5. redis任务队列

6.配置代理

golang爬虫框架colly

1. 爬取简书首页文章列表

2.爬取文章列表和详情

3. 爬取需要登录的网页

4. 内存任务队列

5. redis任务队列

6.配置代理

用户登录

今日阅读排行

一周阅读排行

关注我

给该专栏投稿 写篇新文章

收入到我管理的专栏 新建专栏

1. 爬取简书首页文章列表

2.爬取文章列表和详情

3. 爬取需要登录的网页

4. 内存任务队列

5. redis任务队列

6.配置代理

给该专栏投稿写篇新文章

收入到我管理的专栏新建专栏