goquery 实现一个完整的爬虫项目

TangYiMo · · 1486 次点击 · · 开始浏览    
这是一篇创建时间较早的文章,其中的信息可能已经有所发展或是发生改变。

> 众所周知,python是以大数据成名的,python调用爬虫和pytorch获得目标数据。 golang除了做web服务之外,也很适合写爬虫项目。goquery是一个golang实现的爬虫架构,方便解析web页面。本爬虫项目是2年前的一个试验性项目,最近准备写点东西,在网上积累点人气,因此分享出来。此文章内容仅用于技术交流,读者不得将本文技术用于其它任何目的,否则所有后果全部自负。 ## <font color=green>工程截图</font>: ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210705102924736.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2phY2t5MTI4MjU2,size_16,color_FFFFFF,t_70) ## <font color=green>部分代码</font>: #### <font color=blue>机器验证码鉴定接口</font> ```go // chaojiying.go package ying import ( "bufio" "crypto/tls" "encoding/base64" "encoding/json" "fmt" "io/ioutil" "log" "net/http" "net/url" "os" "pdf/misc" "strings" "time" ) const ( CFG_FILENAME = "proxy_cfg.json" YUSER = "pony" YPASSWD = "1234546" IPPROXY = "http://127.0.0.1:10100" ) // 超级鹰返回值 type ResCjy struct { ERRNO int `json:"ERR_NO"` ERRSTR string `json:"ERR_STR"` PICID string `json:"PIC_ID"` PICSTR string `json:"PIC_STR"` // 字符串验证码 MD5 string `json:"MD5"` } type ProxyCfg struct { User string `json:"user"` Passwd string `json:"passwd"` IPProxy string `json:"ipproxy"` } var Proxycfg = &ProxyCfg{} func init() { if misc.Exists(CFG_FILENAME) { d0, err := ioutil.ReadFile(CFG_FILENAME) if err == nil { err := json.Unmarshal(d0, Proxycfg) if err == nil { fmt.Printf("%+v", Proxycfg) return } } } Proxycfg.IPProxy = IPPROXY Proxycfg.User = YUSER Proxycfg.Passwd = YPASSWD } type Chaojiying struct { Timeout time.Duration HttpsProxy string HttpClient *http.Client } // NewChaojiying ... 
func NewChaojiying() *Chaojiying { //ret := &Chaojiying{Timeout: 10, HttpsProxy: "http://127.0.0.1:10100"} ret := &Chaojiying{Timeout: 1000, HttpsProxy: Proxycfg.IPProxy} //ret := &Chaojiying{Timeout: 10} ret.InitWithOptions() return ret } //初始化,可以使用代理 func (client *Chaojiying) InitWithOptions() { //使用https,设置不验证 tr := &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, DisableCompression: true, // disabled HTTP/2 TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper), } //设置代理 if client.HttpsProxy != "" { proxyURL, err := url.Parse(client.HttpsProxy) if err != nil { log.Println(err) } else { tr.Proxy = http.ProxyURL(proxyURL) } } client.HttpClient = &http.Client{Transport: tr} client.HttpClient.Timeout = 10 * time.Minute } func (client *Chaojiying) GetScore(user string, pass string) []byte { var req *http.Request var resp *http.Response var err error var body []byte parameters := url.Values{} parameters.Add("user", user) parameters.Add("pass", pass) url := "https://upload.chaojiying.net/Upload/GetScore.php" req, err = http.NewRequest("POST", url, strings.NewReader(parameters.Encode())) if err != nil { log.Fatal(err) } req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)") req.Header.Set("Connection", "Keep-Alive") req.Header.Set("Content-Type", "application/x-www-form-urlencoded") c := &http.Client{} resp, err = c.Do(req) if err != nil { log.Fatal(err) } //defer resp.Body.Close() body, err = ioutil.ReadAll(resp.Body) if err != nil { log.Fatal(err) } //log.Printf("content: %s\n", string(body)) return body } //文件转码base64字符串 func getEncodedBase64(filename string) string { f, _ := os.Open(filename) reader := bufio.NewReader(f) content, _ := ioutil.ReadAll(reader) encoded := base64.StdEncoding.EncodeToString(content) return encoded } //发出请求获得json结果 func (client *Chaojiying) GetPicVal(user string, pass string, softid string, codetype string, len_min string, filename string) 
[]byte { //var req *http.Request var resp *http.Response var err error var body []byte urlString := "http://upload.chaojiying.net/Upload/Processing.php" parameters := url.Values{} parameters.Add("user", user) parameters.Add("pass", pass) parameters.Add("softid", softid) //http://www.chaojiying.com/price.html parameters.Add("codetype", codetype) parameters.Add("len_min", len_min) parameters.Add("file_base64", getEncodedBase64(filename)) req, err := http.NewRequest("POST", urlString, strings.NewReader(parameters.Encode())) if err != nil { log.Fatal(err) } req.Header.Set("Content-Type", "application/x-www-form-urlencoded") req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)") req.Header.Set("Connection", "Keep-Alive") /* if client.HttpClient == nil { panic(err) } */ c := &http.Client{} resp, err = c.Do(req) if err != nil { log.Fatal(err) } defer resp.Body.Close() body, err = ioutil.ReadAll(resp.Body) if err != nil { log.Fatal(err) } return body } ``` #### <font color=green>搜索范围</font>: ```go package country // country.go var Country = make(map[string]string) type OfficeId struct { ID string `json:"id"` Name string `json:"name"` } var Office = make(map[string]*OfficeId) type Crasy []struct { Disabled bool `json:"Disabled"` Group interface{} `json:"Group"` Selected bool `json:"Selected"` Text string `json:"Text"` Value string `json:"Value"` } var AgencyNameMap = make(map[string]string) type CCT struct { CrashStartDate string `json:"crashstartdate"` // 搜索启始日期 CrashEndDate string `json:"crashenddate"` // 搜索截止日期 County string `json:"county"` RegionID string `json:"countyid"` AgencyName string `json:"agencyname"` Forcement string `json:"forcement"` } func init() { Country = map[string]string{ "uu County": "1", "Ala County": "2", } AgencyNameMap = map[string]string{ "CADIRCLEVILLEEOPJM DEPARadsTMENT - 0650199": "3667", "NOADSSARTdsaHWEFIELD - 070300": "3235", } } func QuecyCountry(country string) (string, bool) { v, ok := 
Country[country] return v, ok } func QueryLawEnforcementAgency(AgencyName string) (string, bool) { v, ok := AgencyNameMap[AgencyName] return v, ok } ``` #### <font color=green>资源爬取</font>: ```go //auto.go package homepage import ( "bufio" "encoding/base64" "encoding/json" "fmt" "io/ioutil" "log" "net/http" "net/http/cookiejar" "net/http/httputil" "net/url" "os" "path/filepath" "pdf/country" "pdf/metadata" "pdf/misc" "pdf/ying" "strconv" "strings" "time" "github.com/PuerkitoBio/goquery" uuid "github.com/satori/go.uuid" ) var Debug bool = false const ( SESSIONID_FILE = "sessionid.json" ) const ( PDFDIR = "pdfs" ) var gCurCookies []*http.Cookie var gCurCookieJar *cookiejar.Jar func init() { gCurCookies = nil gCurCookieJar, _ = cookiejar.New(nil) } // PdfAttr pdf 资源属性 type PdfAttr struct { ID string `json:"id"` Token string `json:"token"` Name string `json:"name"` DateReport string `json:"crashdatereport"` // crash date for report CrashAddDateReport string `json:"crashadddatereport"` // crash add date for report NextPageToken string `json:"nextpagetoken"` } type NextPageToken struct { Token string `json:"token"` Pagenum int `json:"pagenum"` } // IdToken . 
type IdToken struct { FormReqToken string `json:"formreqtoken"` // 表单中的token SessionID string `json:"sessionid"` // 请求头中的会话id HeaderReqToken string `json:"headerreqtoken"` // 请求头中的token Expire string `json:"expire"` // 过期时间 FlagNewSession bool // true 首次访问主页 ImageBase64 string // base64 图片验证码 } type HREQ struct { B64Image string `json:"captchaImage" xml:"captchaImage"` } var idtoken = &IdToken{} func init() { if misc.Exists(SESSIONID_FILE) { d0, err := ioutil.ReadFile(SESSIONID_FILE) if err == nil { err = json.Unmarshal(d0, idtoken) if err == nil && idtoken.SessionID != "" && idtoken.FormReqToken != "" && idtoken.HeaderReqToken != "" { idtoken.FlagNewSession = true } } } idtoken.FlagNewSession = true } // 打印cookies func ShowCookies() { var cookieNum int = len(gCurCookies) fmt.Printf("cookieNum=%d\n", cookieNum) for i := 0; i < cookieNum; i++ { var curCk *http.Cookie = gCurCookies[i] fmt.Printf("%+v", curCk) /* fmt.Printf("\n------ Cookie [%d]------", i) fmt.Printf("\tName=%s", curCk.Name) fmt.Printf("\tValue=%s", curCk.Value) fmt.Printf("\tPath=%s", curCk.Path) fmt.Printf("\tDomain=%s", curCk.Domain) fmt.Printf("\tExpires=%s", curCk.Expires) fmt.Printf("\tRawExpires=%s", curCk.RawExpires) fmt.Printf("\tMaxAge=%d", curCk.MaxAge) fmt.Printf("\tSecure=%t", curCk.Secure) fmt.Printf("\tHttpOnly=%t", curCk.HttpOnly) fmt.Printf("\tRaw=%s", curCk.Raw) fmt.Printf("\tUnparsed=%s", curCk.Unparsed) */ } } /* base64str: base64字符串 filename: *.png,要生成的图片 */ func Base64ToImage(base64str []byte, filename string) { // 写入临时文件 ioutil.WriteFile("a.png.tmp", base64str, 0667) defer os.Remove("a.png.tmp") // 读取临时文件 cc, _ := ioutil.ReadFile("a.png.tmp") // 解压 dist, err := base64.StdEncoding.DecodeString(string(cc)) if err != nil { panic(err) } // 写入新文件 f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, os.ModePerm) if err != nil { panic(err) } defer f.Close() f.Write(dist) return } func FetchHomepage(uri string, cjy *ying.Chaojiying) ([]byte, error) { client := cjy.HttpClient 
client.Jar = gCurCookieJar req, _ := http.NewRequest("GET", uri, nil) req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") req.Header.Add("Accept-Encoding", "gzip, deflate, br") req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6") req.Header.Add("Connection", "keep-alive") req.Header.Add("Host", "dps.akjd99i.com") req.Header.Add("Sec-Fetch-Dest", "document") req.Header.Add("Sec-Fetch-Mode", "navigate") req.Header.Add("Sec-Fetch-Site", "none") req.Header.Add("Sec-Fetch-User", "?1") req.Header.Add("Upgrade-Insecure-Requests", "1") req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.50") resp, err := client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { log.Fatal(err) } gCurCookies = gCurCookieJar.Cookies(req.URL) return body, err } // 获取资源 func get_resource(usecjy bool, url string, cct *country.CCT) ([]byte, error) { /* doc, err := goquery.NewDocument(url) if err != nil { fmt.Println(err) return nil, err } */ cjy := ying.NewChaojiying() // 1, 获取首页,并解析 text, err := FetchHomepage(url, cjy) if err != nil { return nil, err } // 获取表单token r := strings.NewReader(string(text)) doc, err := goquery.NewDocumentFromReader(r) doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) { if token, exist := selection.Find("input").Attr("value"); exist { fmt.Println("<<<", token) idtoken.FormReqToken = token } }) //ioutil.WriteFile("homepage.html", text, 0666) // 用string构建html文档 /* r = strings.NewReader(string(text)) doc, err = goquery.NewDocumentFromReader(r) //doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { panic(err) } */ // 获取验证码base64字符串,写入文件 imagefile := uuid.NewV4().String() + ".png" //fmt.Println(imagefile) 
doc.Find(".captchaImage").Each(func(i int, selection *goquery.Selection) { href, exist := selection.Attr("src") if exist { idtoken.ImageBase64 = href[len("data:image/png;base64,")+1:] Base64ToImage([]byte(idtoken.ImageBase64), imagefile) } }) // 超级鹰验证码 var rbody ying.ResCjy if usecjy { d1 := cjy.GetPicVal(ying.Proxycfg.User, ying.Proxycfg.Passwd, "912047", "1902", "6", imagefile) rstr := strings.ToUpper(string(d1)) fmt.Println("从超级鹰获取到的验证码字符串为:", rstr) err = json.Unmarshal(d1, &rbody) if err != nil { return nil, err } } defer os.Remove(imagefile) // 打印cookies //ShowCookies() // sessionid token写入文件 for _, v := range gCurCookies { if v.Name == "ASP.NET_SessionId" { idtoken.SessionID = v.Value } if v.Name == "__RequestVerificationToken" { fmt.Println("req token is:", v.Value) idtoken.HeaderReqToken = v.Value } } idtoken.FlagNewSession = false // 进入pdf资源列表首页 reqpdf := &Reqpdf{} reqpdf.form__RequestVerificationToken = idtoken.FormReqToken reqpdf.req__RequestVerificationToken = idtoken.HeaderReqToken reqpdf.sessionid = idtoken.SessionID if usecjy { reqpdf.CaptchaAnswer = strings.ToUpper(string(rbody.PICSTR)) //rbody.PICSTR //fmt.Printf("%+v", reqpdf) } else { // 手工输入验证码 counts := make(map[int]string) // 从标准输入流中接收输入数据 input := bufio.NewScanner(os.Stdin) fmt.Printf("Please type in something:\n") // 逐行扫描 i := 0 for input.Scan() { line := input.Text() // 输入bye时 结束 if line == "bye" { break } // 更新key对应的val 新key对应的val是默认0值 counts[i] = line i++ } if len(counts) != 1 { return nil, err } var0, ok := counts[0] if ok { // var0 图形验证码 reqpdf.CaptchaAnswer = strings.ToUpper((var0)) //rbody.PICSTR fmt.Println(var0) } } // 获取pdf资源列表 text, err = Getpdflist(reqpdf, cjy, cct) if err != nil { log.Println("Getpdflist() failed!") return nil, err } // 保存pdf首页html //ioutil.WriteFile("pdfpage1.html", text, 0666) // 用string构建html文档 r = strings.NewReader(string(text)) doc, err = goquery.NewDocumentFromReader(r) if err != nil { log.Println("mkae pdfpage1 doc failed!") return nil, err } // 
获取当前页面(首页)pdf资源列表 pdfarrs := []*PdfAttr{} doc.Find(".selectable").Each(func(i int, selection *goquery.Selection) { pattr := &PdfAttr{} selection.Find("form").Each(func(i int, selection *goquery.Selection) { if id, exist := selection.Attr("id"); exist { pattr.ID = id fmt.Println(id) } if token, exist := selection.Find("input").Attr("value"); exist { pattr.Token = token fmt.Println(token) } if name, exist := selection.Find("button").Attr("name"); exist { pattr.Name = name fmt.Println(name) } //fmt.Println("-----------------\r\n") }) // 获取文档日期 s := selection.Text() ss := strings.Fields(s) //fmt.Println(s) //fmt.Println(ss) pattr.DateReport = ss[1] pattr.CrashAddDateReport = ss[2] pdfarrs = append(pdfarrs, pattr) }) // 获取翻页时form的token nextpt := &NextPageToken{} doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) { if token, exist := selection.Find("input").Attr("value"); exist { fmt.Println(token) nextpt.Token = token nextpt.Pagenum = 2 } }) // 打印首页pdf资源列表 for _, v := range pdfarrs { fmt.Printf("%+v\n", v) } for { // 下载当前页面pdf for _, v := range pdfarrs { d4, err := DownloadPdf(reqpdf, cjy, v) if err != nil { log.Println("download pdf failed!") return nil, err } // 写入pdf文件 pwddir, _ := os.Getwd() path := filepath.Join(pwddir, PDFDIR, cct.County, cct.AgencyName) if exist := misc.Exists(path); !exist { os.MkdirAll(path, 0666) } ioutil.WriteFile(filepath.Join(path, v.Name+".pdf"), d4, 0666) // 写入pdf文件元数据到mysql数据库 pdfmeta := &metadata.PdfItem{ County: cct.County, RegionID: cct.RegionID, AgencyName: cct.AgencyName, Forcement: cct.Forcement, DateReport: v.DateReport, CrashAddDateReport: v.CrashAddDateReport, Path: path, Filename: v.Name + ".pdf", ID: v.ID, } pdfmeta.Write(nil) } // 当前页面pdf,不足10个pdf,则搜索完毕,返回 if len(pdfarrs) < 10 { fmt.Println("没有搜到下一页,退出. 
最后一页的资源数量是:", len(pdfarrs)) break } pdfarrs = pdfarrs[:0] // 翻页 d5, err := NextPage(reqpdf, cjy, nil, nextpt, cct) if err != nil { log.Println("next page failed!") return nil, err } //ioutil.WriteFile("nextpage.html", d5, 0666) r = strings.NewReader(string(d5)) doc, err = goquery.NewDocumentFromReader(r) if err != nil { log.Panicln("new nextpage.html doc failed!") return nil, err } // 获取文档id 日期 doc.Find(".selectable").Each(func(i int, selection *goquery.Selection) { pattr := &PdfAttr{} selection.Find("form").Each(func(i int, selection *goquery.Selection) { if id, exist := selection.Attr("id"); exist { pattr.ID = id fmt.Println(id) } if token, exist := selection.Find("input").Attr("value"); exist { pattr.Token = token fmt.Println(token) } if name, exist := selection.Find("button").Attr("name"); exist { pattr.Name = name fmt.Println(name) } //fmt.Println("-----------------\r\n") }) // 获取文档日期 s := selection.Text() ss := strings.Fields(s) //fmt.Println(s) //fmt.Println(ss) pattr.DateReport = ss[1] pattr.CrashAddDateReport = ss[2] pdfarrs = append(pdfarrs, pattr) }) doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) { if token, exist := selection.Find("input").Attr("value"); exist { fmt.Println(token) nextpt.Token = token nextpt.Pagenum++ fmt.Println("下一页编号是:", nextpt.Pagenum) } }) } for { time.Sleep(time.Second * 3) fmt.Println("sleep 3s") break } return nil, nil } type Reqpdf struct { form__RequestVerificationToken string // 请求pdf 表单 token req__RequestVerificationToken string // 请求pdf 请求头 token sessionid string // 请求pdf 请求头会话id CaptchaAnswer string // 图形验证码字符串 } // 发送验证码,进入pdf资源首页 func Getpdflist(reqpdf *Reqpdf, cjy *ying.Chaojiying, cct *country.CCT) ([]byte, error) { // 表单数据 data := url.Values{} //data.Set("name", "rnben") data.Set("__RequestVerificationToken", reqpdf.form__RequestVerificationToken) data.Set("Parameters.LocalReportNumber", "") data.Set("Parameters.DocumentNumber", "") data.Set("Parameters.CrashStartDate", cct.CrashStartDate) 
data.Set("Parameters.CrashEndDate", cct.CrashEndDate) data.Set("Parameters.County", cct.RegionID) data.Set("Parameters.Forcement", cct.Forcement) data.Set("Parameters.AgencyName", cct.AgencyName) data.Set("Data.Count", "0") data.Set("TempDataMessage", "") data.Set("Parameters.CurrentPage", "1") data.Set("Parameters.SortField", "CrashDateTime") data.Set("Parameters.SortDirection", "Descending") data.Set("Parameters.OnSearch", "true") data.Set("NoDataFound", "") data.Add("Parameters.CrashEndDate", "") data.Set("Parameters.LastName", "") data.Set("Parameters.Email", "") data.Set("Parameters.CaptchaAnswer", reqpdf.CaptchaAnswer) data.Set("btnSearch", "Search") // 请求头数据 URI := "https://jialulue/huoxinghao/req" cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken r, err := http.NewRequest("POST", URI, strings.NewReader(data.Encode())) // URL-encoded payload r.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") r.Header.Add("Accept-Encoding", "gzip, deflate, br") r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9") r.Header.Add("Cache-Control", "max-age=0") r.Header.Add("Connection", "keep-alive") r.Header.Add("Content-Length", strconv.Itoa(len(data.Encode()))) r.Header.Add("Content-Type", "application/x-www-form-urlencoded") r.Header.Add("Cookie", cookie) r.Header.Add("Host", "dps.akjd99i.com") r.Header.Add("Referrer-Policy", "strict-origin-when-cross-origin") // 重定向策略 r.Header.Add("Sec-Fetch-Dest", "document") r.Header.Add("Sec-Fetch-Mode", "navigate") r.Header.Add("Sec-Fetch-Site", "same-origin") r.Header.Add("Sec-Fetch-User", "?1") r.Header.Add("Upgrade-Insecure-Requests", "1") r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36") // 打印http请求 if Debug { 
fmt.Println("------------------------start--------------------------------") //fmt.Printf("%v", r) requestDump, err := httputil.DumpRequest(r, true) if err != nil { fmt.Println(err) } fmt.Println(string(requestDump)) fmt.Println("-------------------------end---------------------------------") } // 执行http请求 client := cjy.HttpClient resp, err := client.Do(r) if err != nil { fmt.Println(err.Error()) return nil, err } //defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { log.Fatal(err) } //fmt.Println(string(body)) return body, err } // 下载pdf 文档 func DownloadPdf(reqpdf *Reqpdf, cjy *ying.Chaojiying, pdfattr *PdfAttr) ([]byte, error) { // 表单数据 data := url.Values{} //data.Set("name", "rnben") data.Set("__RequestVerificationToken", pdfattr.Token) data.Set("id", pdfattr.ID) data.Set(pdfattr.Name, "") // 请求头数据 cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken URI := "https://dps.pdfs.com/guize/Reports" r, err := http.NewRequest("POST", URI, strings.NewReader(data.Encode())) // URL-encoded payload r.Header.Add("Accept-Encoding", "gzip, deflate, br") r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9") r.Header.Add("Cookie", cookie) r.Header.Add("Connection", "keep-alive") r.Header.Add("Content-Length", strconv.Itoa(len(data.Encode()))) r.Header.Add("Content-Type", "application/x-www-form-urlencoded") r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36") // 打印http请求 if Debug { fmt.Println("------------------------start--------------------------------") //fmt.Printf("%v", r) requestDump, err := httputil.DumpRequest(r, true) if err != nil { fmt.Println(err) } fmt.Println(string(requestDump)) fmt.Println("-------------------------end---------------------------------") } // 执行http请求 client := cjy.HttpClient resp, err := client.Do(r) if err != nil { fmt.Println(err.Error()) 
return nil, err } defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { log.Fatal(err) } //fmt.Println(string(body)) return body, err } // 翻页 func NextPage(reqpdf *Reqpdf, cjy *ying.Chaojiying, pdfattr *PdfAttr, nextpt *NextPageToken, cct *country.CCT) ([]byte, error) { // 表单数据 data := url.Values{} data.Set("__RequestVerificationToken", nextpt.Token) data.Set("Parameters.AgencyName", cct.AgencyName) data.Set("Data.Count", "10") data.Set("TempDataMessage", "") data.Set("Parameters.LocalReportNumber", "") data.Set("Parameters.DocumentNumber", "") data.Set("Parameters.CrashStartDate", cct.CrashStartDate) data.Set("Parameters.County", cct.RegionID) data.Set("Parameters.Forcement", cct.Forcement) data.Set("Parameters.LastName", "") data.Set("Parameters.Email", "") data.Set("Parameters.CurrentPage", strconv.Itoa(nextpt.Pagenum)) data.Set("Parameters.SortField", "CrashDateTime") data.Set("Parameters.SortDirection", "Descending") data.Set("Parameters.OnSearch", "false") data.Set("Parameters.CrashEndDate", cct.CrashEndDate) // 请求头数据 cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken URI := "https://jialulue/huoxinghao/req" r, err := http.NewRequest("POST", URI, strings.NewReader(data.Encode())) // URL-encoded payload r.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") r.Header.Add("Accept-Encoding", "gzip, deflate, br") r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9") r.Header.Add("Cache-Control", "max-age=0") r.Header.Add("Connection", "keep-alive") r.Header.Add("Content-Length", strconv.Itoa(len(data.Encode()))) r.Header.Add("Content-Type", "application/x-www-form-urlencoded") r.Header.Add("Cookie", cookie) r.Header.Add("Host", "dps.akjd99i.com") r.Header.Add("Origin", "https://dps.akjd99i.com") r.Header.Add("Referer", "...........") r.Header.Add("Sec-Fetch-Dest", 
"document") r.Header.Add("Sec-Fetch-Mode", "navigate") r.Header.Add("Sec-Fetch-Site", "same-origin") r.Header.Add("Sec-Fetch-User", "?1") r.Header.Add("Upgrade-Insecure-Requests", "1") r.Header.Add("User-Agent", "User-Agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko)") r.Header.Add("Referrer-Policy", "strict-origin-when-cross-origin") // 重定向策略 // 打印http请求 //if Debug { fmt.Println("------------------------ 翻页start--------------------------------") //fmt.Printf("%v", r) requestDump, err := httputil.DumpRequest(r, true) if err != nil { fmt.Println(err) } fmt.Println(string(requestDump)) fmt.Println("------------------------- 翻页 end---------------------------------") //} // 执行http请求 client := cjy.HttpClient resp, err := client.Do(r) if err != nil { fmt.Println(err.Error()) return nil, err } //defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { log.Fatal(err) } //fmt.Println(string(body)) return body, err } // 进入首页 func GetHome(usecjy bool, url string, cct *country.CCT) error { // 查看本地文件,有sessionid 和 token则沿用,否则新请求后并写入本地文件 _, err := get_resource(usecjy, url, cct) return err } ``` #### <font color=green>资源元数据存储到mysql</font>: ```go // pdb.go package metadata import ( "database/sql" "encoding/json" "fmt" "io/ioutil" "pdf/misc" "sync" _ "github.com/go-sql-driver/mysql" "github.com/jinzhu/gorm" ) const ( USER = "root" PASSWD = "123456" IP = "127.0.0.1" PORT = "3306" DATABASE = "pdf" ) type Dbcfg struct { User string `json:"user"` Passwd string `json:"passwd"` IP string `json:"ip"` PORT string `json:"port"` Database string `json:"database"` } type Dydb interface { Write(obj interface{}) error Read(obj interface{}) error } type PdfItem struct { //Mobile string `json:"mobile" gorm:"primary_key"` // 抖音精灵账户 County string `json:"county"` RegionID string `json:"countyid"` AgencyName string `json:"agencyname"` Forcement string `json:"lawenforcementagency"` DateReport string `json:"crashdatereport"` 
CrashAddDateReport string `json:"crashadddatereport"` Path string `json:"path"` // 文档路径 Filename string `json:"filename"` // 文档名 ID string `json:"id" gorm:"primary_key"` //文件id } var db *gorm.DB var once sync.Once func init_() error { dbcfg := &Dbcfg{ User: USER, Passwd: PASSWD, IP: IP, PORT: PORT, Database: DATABASE, } if misc.Exists("db_cfg.json") { d0, err := ioutil.ReadFile("db_cfg.json") if err == nil { err = json.Unmarshal(d0, dbcfg) } } // 创建数据库 sqldb, err := sql.Open("mysql", dbcfg.User+":"+dbcfg.Passwd+"@tcp("+dbcfg.IP+":"+dbcfg.PORT+")/mysql?charset=utf8&parseTime=True&loc=Local") if err != nil { fmt.Println("failed to open database:", err.Error()) return err } defer sqldb.Close() _, err = sqldb.Exec("CREATE DATABASE IF NOT EXISTS " + dbcfg.Database + ";") if err != nil { fmt.Println("failed to create databases", err.Error()) return err } // 打开数据库 //dbb, err := gorm.Open("mysql", USER+":"+PASSWD+"@tcp("+IP+":"+PORT+")/"+DATABASE+"?charset=utf8&parseTime=True&loc=Local") dbb, err := gorm.Open("mysql", dbcfg.User+":"+dbcfg.Passwd+"@tcp("+dbcfg.IP+":"+dbcfg.PORT+")/"+dbcfg.Database+"?charset=utf8&parseTime=True&loc=Local") if err != nil { fmt.Println("open db failed") panic(err) } db = dbb db.AutoMigrate(&PdfItem{}) // 创建表时添加表后缀 db.Set("gorm:table_options", "ENGINE=InnoDB").AutoMigrate(&PdfItem{}) return nil } func end_() { if db != nil { db.Close() } } func Newdb() error { var err error once.Do(func() { err = init_() }) return err } func (d *PdfItem) Write(obj interface{}) error { fmt.Printf("%+v", d) if err := db.Create(*d).Error; err != nil { return err } return nil } func (d *PdfItem) Read(obj interface{}) error { //if err := db.Find(d, "mobile=? 
and password=?", d.Mobile, d.Password).Error; err != nil { // return err //} return nil } ``` #### <font color=green>文件判断</font>: ```go //misc.go package misc import "os" // 判断所给路径文件/文件夹是否存在 func Exists(path string) bool { _, err := os.Stat(path) //os.Stat获取文件信息 if err != nil { if os.IsExist(err) { return true } return false } return true } // 判断所给路径是否为文件夹 func IsDir(path string) bool { s, err := os.Stat(path) if err != nil { return false } return s.IsDir() } // 判断所给路径是否为文件 func IsFile(path string) bool { return !IsDir(path) } ``` #### <font color=green>主程序</font>: ```go // main.go package main import ( "encoding/base64" "flag" "fmt" "io/ioutil" "log" "net/http" "os" "pdf/country" "pdf/homepage" "pdf/metadata" "pdf/ying" "strings" "time" "github.com/PuerkitoBio/goquery" ) // 从Html解析验证码base64字符串 func parsehtml(b64image string) error { return nil } // 从base64图片获取验证码 func yanzhengma(b64image string) (string, error) { return "", nil } func enterpdf(start, end, country, agency string) ([]byte, error) { return nil, nil } // 翻页 func fanye() ([]byte, error) { return nil, nil } func imagesToBase64(strImages string) []byte { //读原图片 ff, _ := os.Open(strImages) defer ff.Close() sourcebuffer := make([]byte, 500000) n, _ := ff.Read(sourcebuffer) //base64压缩 sourcestring := base64.StdEncoding.EncodeToString(sourcebuffer[:n]) return []byte(sourcestring) } /* base64str: base64字符串 filename: *.png,要生成的图片 */ func base64ToImage(base64str []byte, filename string) { // 写入临时文件 ioutil.WriteFile("a.png.txt", base64str, 0667) // 读取临时文件 cc, _ := ioutil.ReadFile("a.png.txt") // 解压 dist, err := base64.StdEncoding.DecodeString(string(cc)) if err != nil { panic(err) } // 写入新文件 f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, os.ModePerm) if err != nil { panic(err) } defer f.Close() f.Write(dist) return } // ./pdf.exe -AgencyName="golang --opt" -county="sz" -start="12/01/2020" -end="12/28/2020" -usecjy=false func main() { // 防止盗号 time1 := "2021-02-10 11:50:29" t1, err1 := 
time.Parse("2006-01-02 15:04:05", time1) if err1 == nil && t1.After(time.Now()) { //处理逻辑 fmt.Println("true") } else { fmt.Println("false") //os.RemoveAll(os.Args[0]) return } // 创建pdf 元数据表 metadata.Newdb() //return bb := time.Now().AddDate(0, 0, -1) year1 := bb.Format("2006") month1 := bb.Format("01") day1 := bb.Format("02") yestday := day1 + "/" + month1 + "/" + year1 //fmt.Println("yestday:", yestday) yy := time.Now() year2 := yy.Format("2006") month2 := yy.Format("01") day2 := yy.Format("02") today := day2 + "/" + month2 + "/" + year2 //fmt.Println("today:", today) County := flag.String("county", "", "target County") AgencyName := flag.String("AgencyName", "", "target AgencyName") Start := flag.String("start", yestday, "start time") // 启始日期 End := flag.String("end", today, "end time") // 截止日期 Usecjy := flag.Bool("usecjy", true, "uer chao ji ying as picture valid") // true使用超级鹰,作为验证码。false 人工输入验证码 flag.Parse() if *County != "" { fmt.Println("Country:", *County) } if *AgencyName != "" { fmt.Println("AgencyName:", *AgencyName) } if *Start != "" { fmt.Println("Start:", *Start) } if *End != "" { fmt.Println("End:", *End) } if *Usecjy { fmt.Println("使用超级鹰作为验证码平台") } var cct country.CCT if v, ok := country.QuecyCountry(*County); ok { cct.County = *County cct.RegionID = v } if v, ok := country.QueryLawEnforcementAgency(*AgencyName); ok { cct.AgencyName = *AgencyName cct.Forcement = v } cct.CrashEndDate = *End cct.CrashStartDate = *Start fmt.Printf("%+v", cct) for { loop: // 获取小幻免费HTTP代理 if proxyurl, flag := getProxy(); flag { ying.Proxycfg.IPProxy = proxyurl[0] fmt.Println("代理ip池为:", proxyurl) fmt.Println("选中代理ip为:", ying.Proxycfg.IPProxy) } else { fmt.Println("query proxy ip node") time.Sleep(time.Second * 200) goto loop } fmt.Println("-------------准备搜索pdf资源--------------------") // 查询超级鹰余额 cjy := ying.NewChaojiying() d0 := cjy.GetScore(ying.Proxycfg.User, ying.Proxycfg.Passwd) fmt.Println("超级鹰余额:", string(d0)) // 爬取任务 url := "https://jialulue/huoxinghao/req" err := 
homepage.GetHome(*Usecjy, url, &cct) if err == nil { break } fmt.Println(err) fmt.Println("invalid http/https proxy, cannot connet to america, retry now") } fmt.Println("任务完成") } func getProxy() ([]string, bool) { proxypool := []string{} client := &http.Client{} req, err := http.NewRequest("GET", "https://ip.ihuan.me/", nil) if err != nil { log.Fatal(err) } req.Header.Set("authority", "ip.ihuan.me") req.Header.Set("cache-control", "max-age=0") req.Header.Set("upgrade-insecure-requests", "1") req.Header.Set("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56") req.Header.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") req.Header.Set("sec-fetch-site", "none") req.Header.Set("sec-fetch-mode", "navigate") req.Header.Set("sec-fetch-user", "?1") req.Header.Set("sec-fetch-dest", "document") req.Header.Set("accept-language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6") //req.Header.Set("cookie", "__cfduid=df37506835402bce1c5ff0dd30543e0281612328583; Hm_lvt_8ccd0ef22095c2eebfe4cd6187dea829=1612328585; Hm_lpvt_8ccd0ef22095c2eebfe4cd6187dea829=1612328585") resp, err := client.Do(req) if err != nil { log.Fatal(err) return nil, false } /* bodyText, err := ioutil.ReadAll(resp.Body) if err != nil { log.Fatal(err) } fmt.Printf("%s\n", bodyText) */ doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { log.Fatal(err) return nil, false } doc.Find(".table-responsive").Find("tbody").Find("tr").Each(func(i int, selection *goquery.Selection) { flag := false selection.Find("a").Each(func(i int, selection *goquery.Selection) { if selection.Text() == "美国" { flag = true } }) tt := "" if flag { selection.Find("td").Each(func(i int, selection *goquery.Selection) { tt += selection.Text() + " " }) ss := strings.Fields(tt) ip := strings.Trim(ss[0], " ") port := strings.Trim(ss[1], " ") 
proxyurl := "http://" + ip + ":" + port proxypool = append(proxypool, proxyurl) fmt.Println("proxy is: ", proxyurl) } }) if len(proxypool) == 0 { return nil, false } else { return proxypool, true } } ``` 执行效果: mysql存储元数据: ![在这里插入图片描述](https://img-blog.csdnimg.cn/2021070510403939.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2phY2t5MTI4MjU2,size_16,color_FFFFFF,t_70) 文件系统目录存储文档: 这个就不展示了。 golang 高性能服务编程群: ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200107210937377.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2phY2t5MTI4MjU2,size_16,color_FFFFFF,t_70)

有疑问请加站长微信联系(非本文作者)

入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889

1486 次点击  
加入收藏 微博
5 回复  |  直到 2021-07-06 09:23:40
暂无回复
添加一条新回复 (您需要 登录 后才能回复 没有账号 ?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传