使用golang抓取京东全部商品分类信息

pssmart · · 2187 次点击 · 开始浏览    置顶
这是一个创建时间较早的主题,其中的信息可能已经有所发展或是发生改变。
// Command main downloads JD.com's "all categories" page, saves the raw HTML
// to jd_list.html and prints the category tree scraped from it.
package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strings"
	"time"
)

const (
	// catagoryURL is the page listing every JD category.
	catagoryURL = "http://www.jd.com/allSort.aspx"
	// pageDumpFile receives an unmodified copy of the downloaded page.
	pageDumpFile = "jd_list.html"
	// fetchTimeout bounds the whole HTTP exchange so a stalled server cannot
	// hang the program forever.
	fetchTimeout = 30 * time.Second
)

// Mall is the root of the scraped tree: a named shop and its top-level
// categories.
type Mall struct {
	name string
	cat  []*Catagory
}

// Catagory is a top-level category. Only name and subCat are filled in by
// this program; id and link are left at their zero values.
type Catagory struct {
	id     int64
	name   string
	link   string
	subCat []*SubCatagory
}

// SubCatagory is a second-level category taken from a <dt> heading line.
// id is left at its zero value by this program.
type SubCatagory struct {
	id        int64
	name      string
	link      string
	detailCat []*DetailCatagory
}

// DetailCatagory is a third-level category taken from an anchor that follows
// a <dt> heading. id and goods are not populated by this program.
type DetailCatagory struct {
	id    int64
	name  string
	link  string
	goods map[string]interface{}
}

// Jd is the mall populated by main.
var Jd = Mall{name: "Jd"}

// topCatagoryStart matches the <div> that opens a top-level category item.
var topCatagoryStart = regexp.MustCompile(`[[:space:]]*\<div[[:space:]]+class="category-item m"\>`)

// topCatagoryFetch captures the text between <span> and </span> (group 1).
var topCatagoryFetch = regexp.MustCompile(`[[:space:]]*.*\<span\>(?P<topC>.*)\</span\>`)

// topCatagoryEnd matches any line containing a closing </div>.
var topCatagoryEnd = regexp.MustCompile(`[[:space:]]*\</div\>`)

// subCatagoryFetch is an empty placeholder pattern; nothing in this file
// uses it.
var subCatagoryFetch = regexp.MustCompile(``)

// detailCatagoryStart matches a <dt> heading line and captures the link
// (group 1, without the leading "//") and the Han-script name (group 2).
var detailCatagoryStart = regexp.MustCompile(`[[:space:]]*\<dt\>\<a[[:space:]]+href=\"//(?P<cat2link>.*)\"[[:space:]]+target="_blank"\>(?P<cat2name>[\p{Han}]+)\</a\>\</dt\>`)

// detailCatagoryFetch matches an anchor line and captures the link (group 1,
// without the leading "//") and the Han-script name (group 2).
var detailCatagoryFetch = regexp.MustCompile(`[[:space:]]*.*\<a[[:space:]]+href=\"//(?P<cat2link>.*)\"[[:space:]]+target="_blank"\>(?P<cat2name>[\p{Han}]+)\</a\>`)

// dumpJdCatagory prints the category tree of mall to standard output.
// A nil mall prints nothing.
func dumpJdCatagory(mall *Mall) {
	writeCatagoryDump(os.Stdout, mall)
}

// writeCatagoryDump writes the category tree of mall to w. It walks the mall
// it was given rather than the package-level Jd, so any Mall can be dumped.
func writeCatagoryDump(w io.Writer, mall *Mall) {
	if mall == nil {
		return
	}
	fmt.Fprintln(w, mall.name)
	for _, c := range mall.cat {
		// Raw %v dump of the category struct, kept from the original output.
		fmt.Fprintln(w, c)
		fmt.Fprintf(w, "Catagory: %s\n", c.name)
		for _, sc := range c.subCat {
			fmt.Fprintf(w, "SubCatagory: %s, Link: %s\n", sc.name, sc.link)
			for _, dc := range sc.detailCat {
				fmt.Fprintf(w, "DetailCatagory: %s. Link: %s\n", dc.name, dc.link)
			}
		}
	}
}

// parseCatagories scans page line by line and returns the top-level
// categories it finds, each with its sub-categories and detail categories.
//
// The scan is a small state machine:
//   - a topCatagoryStart line enters "top" state and leaves the deeper ones;
//   - in "top" state every topCatagoryFetch line starts a new Catagory and
//     enters "sub" state;
//   - in "sub" state every detailCatagoryStart line starts a new SubCatagory
//     and enters "detail" state;
//   - in "detail" state every detailCatagoryFetch line adds a DetailCatagory
//     to the current SubCatagory, and a topCatagoryEnd line resets all states.
func parseCatagories(page string) []*Catagory {
	var (
		cats                   []*Catagory
		inTop, inSub, inDetail bool
		cat                    *Catagory
		subCat                 *SubCatagory
	)

	for _, line := range strings.Split(page, "\n") {
		if topCatagoryStart.MatchString(line) {
			inTop, inSub, inDetail = true, false, false
		}

		if inTop {
			if m := topCatagoryFetch.FindStringSubmatch(line); m != nil {
				cat = &Catagory{name: m[1]}
				cats = append(cats, cat)
				inSub = true
			}
		}

		// isHeading marks the line that just opened a sub-category.
		isHeading := false
		if inSub {
			if m := detailCatagoryStart.FindStringSubmatch(line); m != nil {
				subCat = &SubCatagory{link: m[1], name: m[2]}
				cat.subCat = append(cat.subCat, subCat)
				inDetail = true
				isHeading = true
			}
		}

		if inDetail {
			// A <dt> heading line also satisfies detailCatagoryFetch; skip it
			// so a sub-category is not listed as its own first child.
			if !isHeading {
				if m := detailCatagoryFetch.FindStringSubmatch(line); m != nil {
					subCat.detailCat = append(subCat.detailCat, &DetailCatagory{link: m[1], name: m[2]})
				}
			}
			if topCatagoryEnd.MatchString(line) {
				inTop, inSub, inDetail = false, false, false
			}
		}
	}
	return cats
}

// fetchPage downloads url and returns the response body. It fails on
// transport errors, on any status other than 200 OK, and on read errors.
func fetchPage(url string) ([]byte, error) {
	client := &http.Client{Timeout: fetchTimeout}
	resp, err := client.Get(url)
	if err != nil {
		return nil, fmt.Errorf("fetching %s: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching %s: unexpected status %s", url, resp.Status)
	}
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading %s: %w", url, err)
	}
	return body, nil
}

// run downloads the category page, saves it to pageDumpFile, parses it into
// Jd and prints the resulting tree.
func run() error {
	body, err := fetchPage(catagoryURL)
	if err != nil {
		return err
	}
	// 0666 is the mode os.Create would have used; WriteFile also closes the
	// file, which the os.Create/Write sequence never did.
	if err := ioutil.WriteFile(pageDumpFile, body, 0666); err != nil {
		return fmt.Errorf("saving page to %s: %w", pageDumpFile, err)
	}

	Jd.cat = parseCatagories(string(body))
	dumpJdCatagory(&Jd)
	return nil
}

// main reports any failure from run on standard error and exits with
// status 1.
func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

入群交流(该群和以上内容无关):Go中文网 QQ交流群:731990104 或 加微信入微信群:274768166 备注:入群; 公众号:Go语言中文网

2187 次点击  
加入收藏 微博
暂无回复
添加一条新回复 (您需要 登录 后才能回复 没有账号 ?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传