使用golang抓取京东全部商品分类信息

pssmart · 2016-05-26 04:08:33 · 4245 次点击 · 大约8小时之前 开始浏览    置顶
这是一个创建于 2016-05-26 04:08:33 的主题,其中的信息可能已经有所发展或是发生改变。

package main

import (
	// "errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strings"
)

// Mall is the root of the scraped category tree for one shopping site.
type Mall struct {
	name string      // display name of the site
	cat  []*Catagory // top-level categories, in page order
}

// Catagory is a top-level category; it owns a list of sub-categories.
type Catagory struct {
	id     int64
	name   string
	link   string
	subCat []*SubCatagory
}

// SubCatagory is a second-level category; it owns a list of detail categories.
type SubCatagory struct {
	id        int64
	name      string
	link      string
	detailCat []*DetailCatagory
}

// DetailCatagory is a third-level (leaf) category.
type DetailCatagory struct {
	id    int64
	name  string
	link  string
	goods map[string]interface{}
}

// Jd is the package-level tree that main fills in and then prints.
var Jd = Mall{name: "Jd"}

// Line-matching patterns used by main while scanning the downloaded page:
//   - topCatagoryStart: a matching line switches the scanner into "top" state.
//   - topCatagoryFetch: submatch 1 is used as the top-level category name.
//   - topCatagoryEnd:   a matching line resets all scanner state flags.
//   - subCatagoryFetch: compiled from an empty pattern; not referenced by the
//     code visible in this file.
//
// NOTE(review): as written here the pattern arguments are not quoted (no
// backquotes or double quotes) and the named groups read "(?P" without a
// "<name>", so these declarations do not compile. The literals appear to have
// been garbled (HTML tag text and quoting lost) and must be restored from the
// original source — TODO confirm.
var topCatagoryStart = regexp.MustCompile([[:space:]]*\) var topCatagoryFetch = regexp.MustCompile([[:space:]]*.*\(?P.*)\) var topCatagoryEnd = regexp.MustCompile([[:space:]]*\) var subCatagoryFetch = regexp.MustCompile(``)

// detailCatagoryStart marks the line that opens a sub-category; main reads
// submatch 1 as the link and submatch 2 as the name.
// NOTE(review): pattern is unquoted and appears garbled, see note above.
var detailCatagoryStart = regexp.MustCompile([[:space:]]*\\.*)\"[[:space:]]+target="_blank"\>(?P[\p{Han}]+)\\)

// detailCatagoryFetch matches a detail (third-level) category line; main reads
// submatch 1 as the link and submatch 2 as the name.
// NOTE(review): pattern is unquoted and appears garbled, see note above.
var detailCatagoryFetch = regexp.MustCompile([[:space:]]*.*\.*)\"[[:space:]]+target="_blank"\>(?P[\p{Han}]+)\)

func dumpJdCatagory(mall *Mall) { fmt.Println(mall.name) for , c := range Jd.cat { fmt.Println(c) fmt.Printf("Catagory: %s\n", c.name) for , sc := range c.subCat { fmt.Printf("SubCatagory: %s, Link: %s\n", sc.name, sc.link) for _, dc := range sc.detailCat { fmt.Printf("DetailCatagory: %s. Link: %s\n", dc.name, dc.link) } } } }

func main() { resp, err := http.Get("http://www.jd.com/allSort.aspx") if err != nil { panic(err.Error()) }

defer resp.Body.Close()

body, err := ioutil.ReadAll(resp.Body)
if err != nil {
    panic(err.Error())
}
//    fmt.Println(string(body))

file, err := os.Create("jd_list.html")
if err != nil {
    panic(err.Error())
}

_, err = file.Write(body)
if err != nil {
    panic(err.Error())
}

Jd.cat = make([]*Catagory, 0, 100)
var top = false
var sub = false
var detail = false
var cat *Catagory
var subCat *SubCatagory
var detailCat *DetailCatagory
s := strings.Split(string(body), "\n")
for _, line := range s {
    if topCatagoryStart.MatchString(line) {
        top = true
        sub = false
        detail = false
    }
    if top == true {
        if topCatagoryFetch.MatchString(line) {
            sub = true
            /*
                fmt.Println(topCatagoryFetch.FindStringSubmatch(line)[1])
                    cat = &Catagory{name: topCatagoryFetch.FindStringSubmatch(line)[1]}
                    cat.subCat = make([]*SubCatagory, 40, 100)
            */
            cat = new(Catagory)
            cat.name = topCatagoryFetch.FindStringSubmatch(line)[1]
            cat.subCat = make([]*SubCatagory, 0, 100)
            Jd.cat = append(Jd.cat, cat)
            //fmt.Println("Catagory")
            //fmt.Println(cat)
        }
    }

    if sub == true {
        if detailCatagoryStart.MatchString(line) {
            /*
                fmt.Println(detailCatagoryStart.FindStringSubmatch(line)[1])
                fmt.Println(detailCatagoryStart.FindStringSubmatch(line)[2])
                    subCat = &SubCatagory{name: detailCatagoryStart.FindStringSubmatch(line)[2], link: detailCatagoryStart.FindStringSubmatch(line)[1]}
                    subCat.detailCat = make([]*DetailCatagory, 50, 100)
            */
            subCat = new(SubCatagory)
            subCat.name = detailCatagoryStart.FindStringSubmatch(line)[2]
            subCat.link = detailCatagoryStart.FindStringSubmatch(line)[1]
            subCat.detailCat = make([]*DetailCatagory, 0, 100)
            cat.subCat = append(cat.subCat, subCat)
            //fmt.Println("SubCatagory")
            //fmt.Println(subCat)
            detail = true
        }
    }

    if detail == true {
        if detailCatagoryFetch.MatchString(line) {
            /*
                fmt.Println(detailCatagoryFetch.FindStringSubmatch(line)[1])
                fmt.Println(detailCatagoryFetch.FindStringSubmatch(line)[2])
                    detailCat = &DetailCatagory{name: detailCatagoryFetch.FindStringSubmatch(line)[2], link: detailCatagoryFetch.FindStringSubmatch(line)[1]}
            */
            detailCat = new(DetailCatagory)
            detailCat.name = detailCatagoryFetch.FindStringSubmatch(line)[2]
            detailCat.link = detailCatagoryFetch.FindStringSubmatch(line)[1]
            subCat.detailCat = append(subCat.detailCat, detailCat)
            //fmt.Println("DetailCatagory")
            //                fmt.Println(detailCat)

        }
        if topCatagoryEnd.MatchString(line) {
            top = false
            sub = false
            detail = false
        }
    }
}

dumpJdCatagory(&Jd)
//fmt.Println(s)
//fmt.Printf("%d bytes has been write to jd_list.html", n)

}


有疑问加站长微信联系(非本文作者)

入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889

4245 次点击  
加入收藏 微博
暂无回复
添加一条新回复 (您需要 登录 后才能回复 没有账号 ?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传