使用golang抓取京东全部商品分类信息

pssmart · · 1790 次点击 · · 开始浏览    
这是一篇创建时间较早的文章,其中的信息可能已经有所发展或是发生改变。

package main

import (
	//	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strings"
)

// Mall is the root of a scraped category tree: a named store together with
// its top-level categories.
type Mall struct {
	// name is the label printed first by dumpJdCatagory (e.g. "Jd").
	name string
	// cat holds the top-level categories in the order they were appended.
	cat  []*Catagory
}

// Catagory is a top-level category of a Mall.
type Catagory struct {
	// id is never assigned in the code visible here and stays 0.
	id     int64
	// name is the text captured from the category's <span> element.
	name   string
	// link is never assigned in the code visible here and stays empty.
	link   string
	// subCat holds the second-level categories found under this one.
	subCat []*SubCatagory
}

// SubCatagory is a second-level category, taken from a <dt> anchor line.
type SubCatagory struct {
	// id is never assigned in the code visible here and stays 0.
	id        int64
	// name is the anchor text captured by detailCatagoryStart.
	name      string
	// link is the captured href value without its leading "//".
	link      string
	// detailCat holds the third-level categories found under this one.
	detailCat []*DetailCatagory
}

// DetailCatagory is a third-level (leaf) category, taken from an anchor line.
type DetailCatagory struct {
	// id is never assigned in the code visible here and stays 0.
	id    int64
	// name is the anchor text captured by detailCatagoryFetch.
	name  string
	// link is the captured href value without its leading "//".
	link  string
	// goods is not read or written anywhere in the code visible here;
	// presumably reserved for per-category product data (TODO confirm).
	goods map[string]interface{}
}

// Jd is the package-level Mall for JD; main fills its cat slice from the
// downloaded category page.
var Jd = Mall{name: "Jd"}

// The patterns below are applied to one line of HTML at a time. None of them
// is anchored, so each matches anywhere inside a line.

// topCatagoryStart matches the opening <div class="category-item m"> tag that
// begins a top-level category section.
var topCatagoryStart = regexp.MustCompile(`[[:space:]]*\<div[[:space:]]+class="category-item m"\>`)
// topCatagoryFetch captures the text between <span> and </span> as group 1
// (named topC); main uses it as the top-level category name.
var topCatagoryFetch = regexp.MustCompile(`[[:space:]]*.*\<span\>(?P<topC>.*)\</span\>`)
// topCatagoryEnd matches a closing </div> tag.
var topCatagoryEnd = regexp.MustCompile(`[[:space:]]*\</div\>`)
// subCatagoryFetch is an empty pattern and is not referenced by the code
// visible here.
var subCatagoryFetch = regexp.MustCompile(``)

// detailCatagoryStart matches a <dt><a href="//LINK" target="_blank">NAME</a></dt>
// line whose NAME consists only of Han characters. Group 1 (cat2link) is the
// link without the leading "//"; group 2 (cat2name) is the name.
var detailCatagoryStart = regexp.MustCompile(`[[:space:]]*\<dt\>\<a[[:space:]]+href=\"//(?P<cat2link>.*)\"[[:space:]]+target="_blank"\>(?P<cat2name>[\p{Han}]+)\</a\>\</dt\>`)

// detailCatagoryFetch matches an <a href="//LINK" target="_blank">NAME</a>
// anchor with a Han-only NAME, using the same two groups as
// detailCatagoryStart. Because it does not require the <dt> wrapper, it also
// matches every line that detailCatagoryStart matches.
var detailCatagoryFetch = regexp.MustCompile(`[[:space:]]*.*\<a[[:space:]]+href=\"//(?P<cat2link>.*)\"[[:space:]]+target="_blank"\>(?P<cat2name>[\p{Han}]+)\</a\>`)

// dumpJdCatagory prints the category tree held by mall to standard output:
// the mall name first, then each top-level category followed by its
// sub-categories and their detail categories (name and link).
func dumpJdCatagory(mall *Mall) {
	fmt.Println(mall.name)
	// Walk the mall that was passed in, not the package-level Jd, so the
	// output always describes the argument.
	for _, c := range mall.cat {
		// Raw struct dump of the category, kept as in the original output.
		fmt.Println(c)
		fmt.Printf("Catagory: %s\n", c.name)
		for _, sc := range c.subCat {
			fmt.Printf("SubCatagory: %s, Link: %s\n", sc.name, sc.link)
			for _, dc := range sc.detailCat {
				fmt.Printf("DetailCatagory: %s. Link: %s\n", dc.name, dc.link)
			}
		}
	}
}

// main downloads JD's "all categories" page, saves the raw HTML to
// jd_list.html, parses the three-level category tree out of it and prints
// the result.
//
// Any network or file-system failure, or a non-200 HTTP status, aborts the
// program with a panic.
func main() {
	resp, err := http.Get("http://www.jd.com/allSort.aspx")
	if err != nil {
		panic(err.Error())
	}
	defer resp.Body.Close()

	// An error page would otherwise be parsed silently into an empty tree.
	if resp.StatusCode != http.StatusOK {
		panic("unexpected HTTP status: " + resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err.Error())
	}

	file, err := os.Create("jd_list.html")
	if err != nil {
		panic(err.Error())
	}
	// Always close the file, and report a close failure too: it can be the
	// only sign that the data never reached disk.
	_, writeErr := file.Write(body)
	closeErr := file.Close()
	if writeErr != nil {
		panic(writeErr.Error())
	}
	if closeErr != nil {
		panic(closeErr.Error())
	}

	Jd.cat = parseCatagories(string(body))
	dumpJdCatagory(&Jd)
}

// parseCatagories scans html line by line and rebuilds the category tree.
//
// It is a small state machine driven by the package-level patterns:
//   - topCatagoryStart switches top-level mode on and resets the deeper modes;
//   - while in top-level mode, every topCatagoryFetch match starts a new
//     Catagory and enables sub-category matching;
//   - every detailCatagoryStart match starts a new SubCatagory under the
//     current Catagory and enables detail matching;
//   - every detailCatagoryFetch match adds a DetailCatagory to the current
//     SubCatagory;
//   - topCatagoryEnd, checked only while detail matching is on, leaves all
//     three modes.
//
// The returned slice is never nil.
func parseCatagories(html string) []*Catagory {
	cats := make([]*Catagory, 0, 100)

	var top, sub, detail bool
	var cat *Catagory
	var subCat *SubCatagory

	for _, line := range strings.Split(html, "\n") {
		if topCatagoryStart.MatchString(line) {
			top, sub, detail = true, false, false
		}

		if top {
			// One FindStringSubmatch call both tests the line and yields
			// the capture; nil means no match.
			if m := topCatagoryFetch.FindStringSubmatch(line); m != nil {
				sub = true
				cat = &Catagory{name: m[1]}
				cats = append(cats, cat)
			}
		}

		// sub is only ever set together with cat, so cat is non-nil here.
		if sub {
			if m := detailCatagoryStart.FindStringSubmatch(line); m != nil {
				subCat = &SubCatagory{name: m[2], link: m[1]}
				cat.subCat = append(cat.subCat, subCat)
				detail = true
			}
		}

		// detail is only ever set together with subCat, so subCat is
		// non-nil here.
		if detail {
			// NOTE(review): a <dt> line that just started a SubCatagory
			// also matches detailCatagoryFetch, so each SubCatagory gets
			// itself as its first detail entry. Presumably unintended, but
			// kept so the output is unchanged; confirm before altering.
			if m := detailCatagoryFetch.FindStringSubmatch(line); m != nil {
				subCat.detailCat = append(subCat.detailCat, &DetailCatagory{name: m[2], link: m[1]})
			}
			if topCatagoryEnd.MatchString(line) {
				top, sub, detail = false, false, false
			}
		}
	}

	return cats
}


有疑问加站长微信联系(非本文作者)

本文来自:CSDN博客

感谢作者:pssmart

查看原文:使用golang抓取京东全部商品分类信息

入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889

1790 次点击  
加入收藏 微博
暂无回复
添加一条新回复(您需要登录后才能回复,没有账号?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传