使用 Go 爬取豆瓣 Top250 并保存成 CSV 格式文件

kqzh · 2019-10-14 10:11:00 · 1336 次点击 · 预计阅读时间 2 分钟 · 大约8小时之前开始浏览

这是一个创建于 2019-10-14 10:11:00 的文章，其中的信息可能已经有所发展或是发生改变。

最终效果传送门

思路

使用 colly 收集器（Collector）的 Visit 方法访问目标网页
通过每个页面中“后页”链接的 URL 依次访问所有页面
在每一页中使用收集器的 OnHTML 回调解析出需要的数据
利用 Go 标准库 encoding/csv 将数据保存为 CSV 格式文件

代码实现

package main

import (
    "encoding/csv"
    "fmt"
    "github.com/gocolly/colly"
    "os"
    "sort"
    "strconv"
)

// Film holds the data collected for a single movie entry.
type Film struct {
    // title is the text of the entry's first ".title" element.
    title string
    // score is the entry's ".rating_num" text parsed as a float64.
    score float64
}

// Wrapper bundles a slice of films with a comparison function so that the
// slice can be sorted through the sort package (see Len, Swap and Less).
type Wrapper struct {
    // films is the slice being sorted; Swap reorders it in place.
    films []Film
    // by is the comparison that Less delegates to for a pair of films.
    by    func(a, b *Film) bool
}

func main() {

    //创建收集器
    c := colly.NewCollector()
    films := make([]Film, 0, 225)

    //获取分页的url
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        link := e.Attr("href")
        if e.Text == "后页>" {
            c.Visit(e.Request.AbsoluteURL(link))
        }
    })

    //获取每页的电影信息
    c.OnHTML("div[class=info]", func(e *colly.HTMLElement) {
        t := e.ChildText(".title:nth-child(1)")
        s, _ := strconv.ParseFloat(e.ChildText(".rating_num"), 64)
        films = append(films, Film{
            title: t,
            score: s,
        })
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL.String())
    })

    //开始访问豆瓣
    c.Visit("https://movie.douban.com/top250")

    //将获取的数据按评分排序
    sort.Sort(Wrapper{films, func(a, b *Film) bool {
        return a.score > b.score
    }})

    f, _ := os.Create("films.csv")
    defer f.Close()

    //防止中文乱码
    f.WriteString("\xEF\xBB\xBF")

    writer := csv.NewWriter(f)
    defer writer.Flush()

    //将爬取信息写入csv文件
    writer.Write([]string{"Title", "score"})
    for _, v := range films {
        writer.Write([]string{v.title, strconv.FormatFloat(v.score, 'f', -1, 64)})
    }

}

// Len returns the number of films held by the wrapper. It is the first of
// the three methods (Len, Swap, Less) required by sort.Interface.
func (pw Wrapper) Len() int {
    count := len(pw.films)
    return count
}
// Swap exchanges the films at indexes i and j in place (sort.Interface).
func (pw Wrapper) Swap(i, j int) {
    list := pw.films
    held := list[j]
    list[j] = list[i]
    list[i] = held
}
// Less reports whether the film at index i must sort before the film at
// index j, as decided by the wrapper's by function (sort.Interface).
func (pw Wrapper) Less(i, j int) bool {
    left, right := &pw.films[i], &pw.films[j]
    return pw.by(left, right)
}