使用golang抓取京东商品数据

pssmart · · 1767 次点击 · · 开始浏览    
这是一篇创建时间较早的文章,其中的信息可能已经有所发展或是发生改变。

package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	//"reflect"
	"errors"
	"regexp"
	"strconv"
	"strings"
	//	"unicode/utf8"
	"database/sql"
	_ "github.com/go-sql-driver/mysql"
)

// JdProduct holds the attributes captured from one item page plus the price
// that main fetches separately. Every field is unexported, so the struct is
// only usable inside this package.
type JdProduct struct {
	// Raw text captured from the page for the corresponding keys.
	skuid, name, skuidkey, href, src string

	cat     []int64  // category IDs parsed from the page's "cat" list
	catName []string // comma-separated pieces of the page's "catName" list

	// Remaining attributes are kept as the text captured from the page;
	// price is the "p" value of the price response.
	brand, pType, venderId, shopId, specialAttrs, price string
}

/*
type JdPrice struct {
	id string `json: "id"`
	p  string `json: "p"`
	m  string `json: "m"`
}
*/

// offTheShelf is the marker main looks for inside a product's specialAttrs
// text.
const offTheShelf = "SaleNo"

// db is a package-level database handle.
// NOTE(review): nothing visible in this file assigns it — main declares its
// own local db — so confirm whether it is still needed.
var db *sql.DB

// Patterns that extract single fields from the "key: value," lines of the item
// page. Each one allows leading whitespace before the key and has exactly one
// named capture group (index 1) holding the value: digits for the numeric
// keys, the text between single quotes for the quoted keys, and the text
// between square brackets for the list keys.
var (
	productSkuidFetch        = regexp.MustCompile(`[[:space:]]*skuid:[[:space:]]+(?P<skuid>[0-9]+),`)
	productNameFetch         = regexp.MustCompile(`[[:space:]]*name:[[:space:]]+\'(?P<name>.*)\',`)
	productSkuidkeyFetch     = regexp.MustCompile(`[[:space:]]*skuidkey:[[:space:]]*\'(?P<skuidkey>.*)\',`)
	productHrefFetch         = regexp.MustCompile(`[[:space:]]*href:[[:space:]]+\'(?P<href>.*)\',`)
	productSrcFetch          = regexp.MustCompile(`[[:space:]]*src:[[:space:]]+\'(?P<src>.*)\',`)
	productCatFetch          = regexp.MustCompile(`[[:space:]]*cat:[[:space:]]+\[(?P<cat>.*)\],`)
	productCatNameFetch      = regexp.MustCompile(`[[:space:]]*catName:[[:space:]]+\[(?P<catname>.*)\],`)
	productBrandFetch        = regexp.MustCompile(`[[:space:]]*brand:[[:space:]]+(?P<brand>[0-9]+),`)
	productPTypeFetch        = regexp.MustCompile(`[[:space:]]*pType:[[:space:]]+(?P<ptype>[0-9]+),`)
	productVenderIdFetch     = regexp.MustCompile(`[[:space:]]*venderId:[[:space:]]*(?P<venderId>[0-9]+),`)
	productShopIdFetch       = regexp.MustCompile(`[[:space:]]*shopId:[[:space:]]*\'(?P<shopId>[0-9]+)\',`)
	productSpecialAttrsFetch = regexp.MustCompile(`[[:space:]]*specialAttrs:[[:space:]]*\[(?P<specialAttrs>.*)\],`)
)

// connectDB opens a handle to the "jd" MySQL database and verifies that the
// server is actually reachable.
//
// sql.Open only validates its arguments and does not dial the server, so the
// handle is pinged before being returned. On failure the returned handle is
// nil and the error carries the underlying cause; the function never panics.
//
// NOTE(review): the credentials are hard-coded in the DSN; consider reading
// them from the environment or a configuration file.
func connectDB() (db *sql.DB, err error) {
	db, err = sql.Open("mysql", "root:leeweop@/jd")
	if err != nil {
		return nil, errors.New("Connect to db failed: " + err.Error())
	}
	if err = db.Ping(); err != nil {
		// Release the pool so a failed connect does not leak the handle.
		db.Close()
		return nil, errors.New("Connect to db failed: " + err.Error())
	}
	return db, nil
}

// createDatabase issues CREATE DATABASE IF NOT EXISTS for the database called
// name, using the utf8 charset and utf8_general_ci collation.
//
// Identifiers cannot be bound as statement parameters, so name is spliced into
// the statement text; to keep that safe it must be non-empty and consist only
// of ASCII letters, digits and underscores. An invalid name or a failed
// statement is reported through the returned error; the function does not
// panic.
func createDatabase(db *sql.DB, name string) error {
	if name == "" {
		return errors.New("Create database failed: empty database name")
	}
	for _, r := range name {
		isWord := r == '_' || (r >= '0' && r <= '9') || (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
		if !isWord {
			return errors.New("Create database failed: invalid database name \"" + name + "\"")
		}
	}

	_, err := db.Exec("CREATE DATABASE IF NOT EXISTS " + name + " DEFAULT CHARSET utf8 COLLATE utf8_general_ci")
	if err != nil {
		return errors.New("Create database failed: " + err.Error())
	}
	return nil
}

// createTable creates the product table called name if it does not already
// exist. The column list mirrors the fields written by insertIntoDB: skuid is
// the primary key, the three category IDs are stored as cat1..cat3 and price
// is a DOUBLE.
//
// Identifiers cannot be bound as statement parameters, so name is spliced into
// the statement text; to keep that safe it must be non-empty and consist only
// of ASCII letters, digits and underscores. The statement is echoed to stdout
// before it is executed. An invalid name or a failed statement is reported
// through the returned error; the function does not panic.
func createTable(db *sql.DB, name string) error {
	if name == "" {
		return errors.New("Create table failed: empty table name")
	}
	for _, r := range name {
		isWord := r == '_' || (r >= '0' && r <= '9') || (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
		if !isWord {
			return fmt.Errorf("Create table failed: invalid table name %q", name)
		}
	}

	command := "CREATE TABLE IF NOT EXISTS " + name + " (skuid BIGINT(64) NOT NULL PRIMARY KEY, name VARCHAR(256) NOT NULL, skuidkey VARCHAR(64) NOT NULL, href VARCHAR(128) NOT NULL, src VARCHAR(128) NOT NULL, cat1 INT(32) NOT NULL, cat2 INT(32) NOT NULL, cat3 INT(32) NOT NULL, brand VARCHAR(128) NOT NULL, pType INT(32) NOT NULL, venderId VARCHAR(64) NOT NULL, shopId VARCHAR(64) NOT NULL, specialAttrs VARCHAR(256) NULL, price DOUBLE NOT NULL)"
	fmt.Println(command)
	if _, err := db.Exec(command); err != nil {
		return errors.New("Create table failed: " + err.Error())
	}
	return nil
}

// insertIntoDB writes product as one row of the "test" table.
//
// The first three entries of product.cat are stored as cat1..cat3, so the
// product must carry at least three category IDs. A nil product, a product
// with too few categories, a statement that cannot be prepared, or a failed
// insert is reported through the returned error; the function does not panic.
func insertIntoDB(db *sql.DB, product *JdProduct) error {
	if product == nil {
		return errors.New("Command error: nil product")
	}
	if len(product.cat) < 3 {
		return errors.New("Command error: product needs at least three category ids")
	}

	stmt, err := db.Prepare("INSERT INTO test (skuid, name, skuidkey, href, src, cat1, cat2, cat3, brand, pType, venderId, shopId, specialAttrs, price)values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		return errors.New("Command error: " + err.Error())
	}
	// Only defer Close once Prepare has succeeded; stmt is nil otherwise.
	defer stmt.Close()

	_, err = stmt.Exec(product.skuid, product.name, product.skuidkey, product.href, product.src, product.cat[0], product.cat[1], product.cat[2], product.brand, product.pType, product.venderId, product.shopId, product.specialAttrs, product.price)
	if err != nil {
		return errors.New("Execute command error: " + err.Error())
	}
	return nil
}

// dumpDatabase prints every row of the "test" table to stdout, one product
// per line in %+v form.
//
// The function has no error result, so any database failure (the initial
// "show databases" probe, the query, a row scan, or an iteration error)
// panics with the error text, matching how the probe failure was already
// handled.
func dumpDatabase(db *sql.DB) {
	if _, err := db.Exec("show databases"); err != nil {
		panic(err.Error())
	}

	rows, err := db.Query("select * from test")
	if err != nil {
		// rows is nil on error; iterating it would dereference nil.
		panic(err.Error())
	}
	defer rows.Close()

	for rows.Next() {
		var p JdProduct
		// Scan targets for cat1..cat3 must exist before taking their addresses.
		p.cat = make([]int64, 3)
		// Destination order follows the column order declared in createTable.
		err := rows.Scan(&p.skuid, &p.name, &p.skuidkey, &p.href, &p.src, &p.cat[0], &p.cat[1], &p.cat[2], &p.brand, &p.pType, &p.venderId, &p.shopId, &p.specialAttrs, &p.price)
		if err != nil {
			panic(err.Error())
		}
		// Print inside the loop so every row is shown, not just the last one.
		fmt.Printf("%+v\n", p)
	}
	if err := rows.Err(); err != nil {
		panic(err.Error())
	}
}

// flushDatabase drops the "test" table. Both the result and the error of the
// statement are discarded, so a failure to drop the table is silently ignored.
func flushDatabase(db *sql.DB) {
	const dropTest = "drop table test"
	db.Exec(dropTest)
}

/*
func init() {
	db, err := connectDB()
	if err != nil {
		panic(err.Error())
	}
	flushDatabase(db)
	createDatabase(db, "jd")
	createTable(db, "test")
	//	dumpDatabase(db)
	//db.Close()
}
*/

func main() {
	//	for i := 260000; i < 2611111; i++ {
	//		u := fmt.Sprintf("http://item.jd.com/%d.html", i)
	//		fmt.Println(u)

	db, err := connectDB()
	if err != nil {
		panic(err.Error())
	}
	flushDatabase(db)
	createDatabase(db, "jd")
	createTable(db, "test")

	resp, err := http.Get("http://item.jd.com/2788767.html")
	//resp, err := http.Get(u)
	if err != nil {
		fmt.Println(err.Error())
	}

	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err.Error())
	}

	var product JdProduct

	if productSkuidFetch.MatchString(string(body)) {
		fmt.Println(productSkuidFetch.SubexpNames()[1])
		fmt.Println(productSkuidFetch.FindStringSubmatch(string(body))[1])
		product.skuid = productSkuidFetch.FindStringSubmatch(string(body))[1]
	}

	if productNameFetch.MatchString(string(body)) {
		fmt.Println(productNameFetch.SubexpNames()[1])
		fmt.Println(productNameFetch.FindStringSubmatch(string(body))[1])
		product.name = productNameFetch.FindStringSubmatch(string(body))[1]

	}

	if productSkuidkeyFetch.MatchString(string(body)) {
		fmt.Println(productSkuidkeyFetch.SubexpNames()[1])
		fmt.Println(productSkuidkeyFetch.FindStringSubmatch(string(body))[1])
		product.skuidkey = productSkuidkeyFetch.FindStringSubmatch(string(body))[1]
	}

	if productHrefFetch.MatchString(string(body)) {
		fmt.Println(productHrefFetch.SubexpNames()[1])
		fmt.Println(productHrefFetch.FindStringSubmatch(string(body))[1])
		product.href = productHrefFetch.FindStringSubmatch(string(body))[1]
	}

	if productSrcFetch.MatchString(string(body)) {
		fmt.Println(productSrcFetch.SubexpNames()[1])
		fmt.Println(productSrcFetch.FindStringSubmatch(string(body))[1])
		product.src = productSrcFetch.FindStringSubmatch(string(body))[1]
	}

	if productCatFetch.MatchString(string(body)) {
		fmt.Println(productCatFetch.SubexpNames()[1])
		fmt.Println(productCatFetch.FindStringSubmatch(string(body))[1])
		//product.cat = productCatFetch.FindStringSubmatch(string(body))[1]
		s := strings.Split(productCatFetch.FindStringSubmatch(string(body))[1], ",")
		for _, val := range s {
			//			fmt.Println(val)
			v, _ := strconv.ParseInt(val, 10, 32)
			//			fmt.Printf("%+c\n", v)
			product.cat = append(product.cat, v)
		}
	}

	if productCatNameFetch.MatchString(string(body)) {
		fmt.Println(productCatNameFetch.SubexpNames()[1])
		fmt.Println(productCatNameFetch.FindStringSubmatch(string(body))[1])
		//product.catName = productCatNameFetch.FindStringSubmatch(string(body))[1]
		s := strings.Split(productCatNameFetch.FindStringSubmatch(string(body))[1], ",")
		for _, val := range s {
			product.catName = append(product.catName, val)
		}
	}

	if productBrandFetch.MatchString(string(body)) {
		fmt.Println(productBrandFetch.SubexpNames()[1])
		fmt.Println(productBrandFetch.FindStringSubmatch(string(body))[1])
		product.brand = productBrandFetch.FindStringSubmatch(string(body))[1]
	}

	if productPTypeFetch.MatchString(string(body)) {
		fmt.Println(productPTypeFetch.SubexpNames()[1])
		fmt.Println(productPTypeFetch.FindStringSubmatch(string(body))[1])
		product.pType = productPTypeFetch.FindStringSubmatch(string(body))[1]

	}

	if productVenderIdFetch.MatchString(string(body)) {
		fmt.Println(productVenderIdFetch.SubexpNames()[1])
		fmt.Println(productVenderIdFetch.FindStringSubmatch(string(body))[1])
		product.venderId = productVenderIdFetch.FindStringSubmatch(string(body))[1]
	}

	if productShopIdFetch.MatchString(string(body)) {
		fmt.Println(productShopIdFetch.SubexpNames()[1])
		fmt.Println(productShopIdFetch.FindStringSubmatch(string(body))[1])
		product.shopId = productShopIdFetch.FindStringSubmatch(string(body))[1]
	}

	if productSpecialAttrsFetch.MatchString(string(body)) {
		fmt.Println(productSpecialAttrsFetch.SubexpNames()[1])
		fmt.Println(productSpecialAttrsFetch.FindStringSubmatch(string(body))[1])
		product.specialAttrs = productSpecialAttrsFetch.FindStringSubmatch(string(body))[1]
		if strings.Contains(product.specialAttrs, offTheShelf) {
			//continue
			fmt.Println("")
		}
	}

	url := "http://p.3.cn/prices/mgets?skuIds=J_" + product.skuid + "&type=" + product.pType
	fmt.Println(url)

	r, err := http.Get(url)
	if err != nil {
		panic(err.Error())
	}

	p, err := ioutil.ReadAll(r.Body)
	if err != nil {
		panic(err.Error())
	}
	fmt.Println(string(p))

	m := make([]map[string]interface{}, 10)
	e := json.Unmarshal([]byte(p), &m)
	if e != nil {
		panic(e.Error())
	}

	if val, ok := m[0]["p"].(string); ok {
		product.price = val
	}

	var name string
	s := strings.Split(product.name, "\\u")
	for _, val := range s {
		//			fmt.Println(val)
		v, _ := strconv.ParseInt(val, 16, 32)
		//			fmt.Printf("%+c\n", v)
		name += fmt.Sprintf("%c", v)
	}

	fmt.Println(name)
	product.name = name
	//fmt.Printf("%+v\n", product)

	insertIntoDB(db, &product)
	dumpDatabase(db)
	file, err := os.Create("jd.html")
	if err != nil {
		panic(err.Error())
	}

	_, err = file.Write(body)
	if err != nil {
		panic(err.Error())
	}
	//	}

	//	fmt.Println(n, " byte has been write to jd.html")
}



有疑问加站长微信联系(非本文作者)

本文来自:CSDN博客

感谢作者:pssmart

查看原文:使用golang抓取京东商品数据

入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889

1767 次点击  
加入收藏 微博
暂无回复
添加一条新回复 (您需要 登录 后才能回复 没有账号 ?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传