> 众所周知,Python 因大数据而成名,常借助爬虫和 PyTorch 等工具获取并处理目标数据。Golang 除了做 web 服务之外,也很适合写爬虫项目。goquery 是一个用 Golang 实现的、类似 jQuery 的 HTML 解析库,方便解析 web 页面。本爬虫项目是 2 年前的一个试验性项目,最近准备写点东西,在网上积累点人气,因此分享出来。本文内容仅用于技术交流,读者不得将本文技术用于其它任何目的,否则一切后果自负。
## <font color=green>工程截图</font>:
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210705102924736.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2phY2t5MTI4MjU2,size_16,color_FFFFFF,t_70)
## <font color=green>部分代码</font>:
#### <font color=blue>机器验证码鉴定接口</font>
```go
// chaojiying.go
package ying
import (
"bufio"
"crypto/tls"
"encoding/base64"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"pdf/misc"
"strings"
"time"
)
// Defaults for the ying package. init applies YUSER, YPASSWD and IPPROXY to
// Proxycfg whenever CFG_FILENAME cannot be found, read or parsed.
const (
// CFG_FILENAME is the JSON file init reads to populate Proxycfg.
CFG_FILENAME = "proxy_cfg.json"
// YUSER is the fallback value for Proxycfg.User.
YUSER = "pony"
// YPASSWD is the fallback value for Proxycfg.Passwd.
// NOTE(review): a credential is hard-coded in source; consider requiring it from configuration.
YPASSWD = "1234546"
// IPPROXY is the fallback value for Proxycfg.IPProxy.
IPPROXY = "http://127.0.0.1:10100"
)
// ResCjy is the JSON value returned by Chaojiying; get_resource decodes the
// GetPicVal response into it and reads PICSTR as the captcha answer.
type ResCjy struct {
ERRNO int `json:"ERR_NO"`
ERRSTR string `json:"ERR_STR"`
PICID string `json:"PIC_ID"`
PICSTR string `json:"PIC_STR"` // captcha text
MD5 string `json:"MD5"`
}
// ProxyCfg is the shape of the JSON configuration file named by CFG_FILENAME.
type ProxyCfg struct {
User string `json:"user"`
Passwd string `json:"passwd"`
IPProxy string `json:"ipproxy"`
}
// Proxycfg is the package-wide configuration instance; it is filled in by init
// and read by NewChaojiying (IPProxy).
var Proxycfg = &ProxyCfg{}
// init fills Proxycfg from CFG_FILENAME when that file exists, can be read
// and decodes as JSON (the decoded value is then printed). In every other
// case the compiled-in defaults IPPROXY, YUSER and YPASSWD are applied.
func init() {
	loaded := false
	if misc.Exists(CFG_FILENAME) {
		if raw, readErr := ioutil.ReadFile(CFG_FILENAME); readErr == nil {
			loaded = json.Unmarshal(raw, Proxycfg) == nil
		}
	}
	if loaded {
		fmt.Printf("%+v", Proxycfg)
		return
	}

	Proxycfg.IPProxy = IPPROXY
	Proxycfg.User = YUSER
	Proxycfg.Passwd = YPASSWD
}
// Chaojiying bundles the HTTP client used by the crawler with the proxy
// setting it was built from.
type Chaojiying struct {
// Timeout is set by NewChaojiying; InitWithOptions does not read it.
Timeout time.Duration
// HttpsProxy is the proxy URL InitWithOptions installs on the transport; empty means no proxy.
HttpsProxy string
// HttpClient is created by InitWithOptions.
HttpClient *http.Client
}
// NewChaojiying returns a Chaojiying whose proxy comes from Proxycfg.IPProxy
// and whose HttpClient has already been built by InitWithOptions.
func NewChaojiying() *Chaojiying {
	cjy := new(Chaojiying)
	cjy.Timeout = 1000
	cjy.HttpsProxy = Proxycfg.IPProxy
	cjy.InitWithOptions()
	return cjy
}
// InitWithOptions builds client.HttpClient. The transport skips TLS
// certificate verification, disables transparent compression and disables
// HTTP/2 (empty TLSNextProto map). When client.HttpsProxy is non-empty and
// parses as a URL it is installed as the proxy; a parse failure is logged and
// the client is created without a proxy. The client timeout is fixed at ten
// minutes.
func (client *Chaojiying) InitWithOptions() {
	transport := &http.Transport{
		TLSClientConfig:    &tls.Config{InsecureSkipVerify: true},
		DisableCompression: true,
		// A non-nil empty map disables HTTP/2.
		TLSNextProto: map[string]func(authority string, c *tls.Conn) http.RoundTripper{},
	}

	if proxy := client.HttpsProxy; proxy != "" {
		if parsed, err := url.Parse(proxy); err == nil {
			transport.Proxy = http.ProxyURL(parsed)
		} else {
			log.Println(err)
		}
	}

	client.HttpClient = &http.Client{
		Transport: transport,
		Timeout:   10 * time.Minute,
	}
}
// GetScore posts the account credentials to the Chaojiying GetScore endpoint
// and returns the raw response body.
//
// The request is sent with a fresh http.Client rather than client.HttpClient,
// so the proxy configured on the receiver is not used. Any failure (building
// the request, sending it, reading the body) terminates the process through
// log.Fatal, because the signature has no error result.
func (client *Chaojiying) GetScore(user string, pass string) []byte {
	form := url.Values{}
	form.Add("user", user)
	form.Add("pass", pass)

	const endpoint = "https://upload.chaojiying.net/Upload/GetScore.php"
	req, err := http.NewRequest("POST", endpoint, strings.NewReader(form.Encode()))
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)")
	req.Header.Set("Connection", "Keep-Alive")
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")

	c := &http.Client{}
	resp, err := c.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	// Close the body so the underlying connection is released.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	return body
}
// getEncodedBase64 reads filename and returns its content encoded with
// standard base64. When the file cannot be opened or read the error is logged
// and the empty string is returned, which matches what callers previously
// received in that situation. The file handle is always closed.
func getEncodedBase64(filename string) string {
	f, err := os.Open(filename)
	if err != nil {
		log.Println(err)
		return ""
	}
	defer f.Close()

	content, err := ioutil.ReadAll(bufio.NewReader(f))
	if err != nil {
		log.Println(err)
		return ""
	}
	return base64.StdEncoding.EncodeToString(content)
}
// GetPicVal uploads the image stored in filename (base64-encoded) to the
// Chaojiying Processing endpoint together with the account and recognition
// parameters, and returns the raw JSON response body.
//
// A fresh http.Client is used, not client.HttpClient. Any failure terminates
// the process through log.Fatal.
// NOTE(review): this endpoint is addressed over plain http while GetScore
// uses https, so the credentials travel unencrypted here — confirm intent.
func (client *Chaojiying) GetPicVal(user string, pass string, softid string, codetype string,
	len_min string, filename string) []byte {
	const endpoint = "http://upload.chaojiying.net/Upload/Processing.php"

	// codetype values are listed at http://www.chaojiying.com/price.html
	form := url.Values{
		"user":        {user},
		"pass":        {pass},
		"softid":      {softid},
		"codetype":    {codetype},
		"len_min":     {len_min},
		"file_base64": {getEncodedBase64(filename)},
	}

	req, err := http.NewRequest("POST", endpoint, strings.NewReader(form.Encode()))
	if err != nil {
		log.Fatal(err)
	}
	for _, kv := range [][2]string{
		{"Content-Type", "application/x-www-form-urlencoded"},
		{"User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"},
		{"Connection", "Keep-Alive"},
	} {
		req.Header.Set(kv[0], kv[1])
	}

	resp, err := (&http.Client{}).Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	payload, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	return payload
}
```
#### <font color=green>搜索范围</font>:
```go
package country
// country.go
// Country maps a county name to its ID string; init replaces it with the
// built-in table and QuecyCountry reads it.
var Country = make(map[string]string)
// OfficeId pairs an ID with a name; it is the value type of the Office map.
type OfficeId struct {
ID string `json:"id"`
Name string `json:"name"`
}
// Office maps a string key to an OfficeId. Nothing in the visible source
// writes to or reads from it.
var Office = make(map[string]*OfficeId)
// Crasy is a slice of entries carrying Disabled/Group/Selected/Text/Value
// fields. NOTE(review): presumably the JSON shape of a drop-down option list
// returned by the target site — confirm; it is not used in the visible source.
type Crasy []struct {
Disabled bool `json:"Disabled"`
Group interface{} `json:"Group"`
Selected bool `json:"Selected"`
Text string `json:"Text"`
Value string `json:"Value"`
}
// AgencyNameMap maps an agency name to its ID string; init replaces it with
// the built-in table and QueryLawEnforcementAgency reads it.
var AgencyNameMap = make(map[string]string)
// CCT carries the search criteria that Getpdflist and NextPage send as form
// fields, and that get_resource uses to build the output directory.
type CCT struct {
CrashStartDate string `json:"crashstartdate"` // search start date
CrashEndDate string `json:"crashenddate"` // search end date
County string `json:"county"`
RegionID string `json:"countyid"`
AgencyName string `json:"agencyname"`
Forcement string `json:"forcement"`
}
// init installs the built-in county and agency lookup tables, replacing the
// empty maps created at declaration time.
func init() {
	counties := make(map[string]string, 2)
	counties["uu County"] = "1"
	counties["Ala County"] = "2"
	Country = counties

	agencies := make(map[string]string, 2)
	agencies["CADIRCLEVILLEEOPJM DEPARadsTMENT - 0650199"] = "3667"
	agencies["NOADSSARTdsaHWEFIELD - 070300"] = "3235"
	AgencyNameMap = agencies
}
// QuecyCountry looks country up in the Country table. It returns the ID and
// true when the name is known, or the empty string and false otherwise.
func QuecyCountry(country string) (string, bool) {
	if id, found := Country[country]; found {
		return id, true
	}
	return "", false
}
// QueryLawEnforcementAgency looks AgencyName up in AgencyNameMap. It returns
// the ID and true when the name is known, or the empty string and false
// otherwise.
func QueryLawEnforcementAgency(AgencyName string) (string, bool) {
	if id, found := AgencyNameMap[AgencyName]; found {
		return id, true
	}
	return "", false
}
```
#### <font color=green>资源爬取</font>:
```go
//auto.go
package homepage
import (
"bufio"
"encoding/base64"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/http/cookiejar"
"net/http/httputil"
"net/url"
"os"
"path/filepath"
"pdf/country"
"pdf/metadata"
"pdf/misc"
"pdf/ying"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
uuid "github.com/satori/go.uuid"
)
// Debug enables dumping of outgoing requests in Getpdflist and DownloadPdf.
var Debug bool = false
const (
// SESSIONID_FILE is the JSON file init reads into idtoken when it exists.
SESSIONID_FILE = "sessionid.json"
)
const (
// PDFDIR is the directory (under the working directory) that get_resource
// writes downloaded PDFs into.
PDFDIR = "pdfs"
)
// Cookie state shared by the crawler functions in this file.
var (
	// gCurCookies holds the cookies captured by FetchHomepage.
	gCurCookies []*http.Cookie
	// gCurCookieJar is installed on the HTTP client by FetchHomepage.
	gCurCookieJar *cookiejar.Jar
)

// init creates the shared cookie jar and clears the cookie snapshot.
func init() {
	jar, _ := cookiejar.New(nil)
	gCurCookieJar = jar
	gCurCookies = nil
}
// PdfAttr describes one PDF entry scraped from a result page; DownloadPdf
// posts ID, Token and Name to fetch the document.
type PdfAttr struct {
ID string `json:"id"`
Token string `json:"token"`
Name string `json:"name"`
DateReport string `json:"crashdatereport"` // crash date for report
CrashAddDateReport string `json:"crashadddatereport"` // crash add date for report
NextPageToken string `json:"nextpagetoken"`
}
// NextPageToken carries the form token and page number NextPage sends when
// requesting a further result page.
type NextPageToken struct {
Token string `json:"token"`
Pagenum int `json:"pagenum"`
}
// IdToken holds the session identifiers and tokens collected while talking to
// the target site; init may preload it from SESSIONID_FILE.
type IdToken struct {
FormReqToken string `json:"formreqtoken"` // token found in the form
SessionID string `json:"sessionid"` // session id sent in the request header
HeaderReqToken string `json:"headerreqtoken"` // token sent in the request header
Expire string `json:"expire"` // expiry time
FlagNewSession bool // true: first visit to the home page
ImageBase64 string // base64-encoded captcha image
}
// HREQ wraps a captcha image string under the "captchaImage" JSON/XML key.
// It is not used in the visible source.
type HREQ struct {
B64Image string `json:"captchaImage" xml:"captchaImage"`
}
// idtoken is the package-wide session state used by get_resource.
var idtoken = &IdToken{}

// init preloads idtoken from SESSIONID_FILE when that file exists and decodes
// as JSON.
//
// FlagNewSession ends up false only when a complete stored session (session
// id, form token and header token all non-empty) was loaded; in every other
// case it is true, meaning the home page must be visited first. Previously
// the flag was set to true on both paths, so the stored-session check had no
// effect.
func init() {
	reusable := false
	if misc.Exists(SESSIONID_FILE) {
		if raw, err := ioutil.ReadFile(SESSIONID_FILE); err == nil {
			if err := json.Unmarshal(raw, idtoken); err == nil {
				reusable = idtoken.SessionID != "" &&
					idtoken.FormReqToken != "" &&
					idtoken.HeaderReqToken != ""
			}
		}
	}
	// Assigned after decoding so a "FlagNewSession" key in the file cannot
	// override the computed value.
	idtoken.FlagNewSession = !reusable
}
// ShowCookies prints how many cookies are held in gCurCookies, followed by
// each cookie formatted with %+v.
func ShowCookies() {
	fmt.Printf("cookieNum=%d\n", len(gCurCookies))
	for _, ck := range gCurCookies {
		fmt.Printf("%+v", ck)
	}
}
// Base64ToImage decodes base64str (standard base64) and writes the resulting
// bytes to filename, replacing any previous content of that file.
//
//	base64str: base64 text of the image
//	filename:  path of the image file to create, e.g. *.png
//
// It panics when the input is not valid base64 or the file cannot be written.
// The data is decoded directly; no intermediate temporary file is used.
func Base64ToImage(base64str []byte, filename string) {
	decoded, err := base64.StdEncoding.DecodeString(string(base64str))
	if err != nil {
		panic(err)
	}
	// WriteFile truncates an existing file, so no stale bytes survive.
	if err := ioutil.WriteFile(filename, decoded, os.ModePerm); err != nil {
		panic(err)
	}
}
// FetchHomepage GETs uri with browser-like headers through cjy.HttpClient and
// returns the response body.
//
// Side effects: the shared cookie jar gCurCookieJar is installed on the
// client, and after a successful read gCurCookies is refreshed with the
// jar's cookies for the request URL.
//
// Every failure (invalid uri, transport error, body read error) is returned
// to the caller; the function no longer terminates the process on a read
// error.
func FetchHomepage(uri string, cjy *ying.Chaojiying) ([]byte, error) {
	client := cjy.HttpClient
	client.Jar = gCurCookieJar

	req, err := http.NewRequest("GET", uri, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	// NOTE(review): with Accept-Encoding set explicitly, net/http does not
	// decompress the response automatically — confirm the server replies
	// uncompressed.
	req.Header.Add("Accept-Encoding", "gzip, deflate, br")
	req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
	req.Header.Add("Connection", "keep-alive")
	req.Header.Add("Host", "dps.akjd99i.com")
	req.Header.Add("Sec-Fetch-Dest", "document")
	req.Header.Add("Sec-Fetch-Mode", "navigate")
	req.Header.Add("Sec-Fetch-Site", "none")
	req.Header.Add("Sec-Fetch-User", "?1")
	req.Header.Add("Upgrade-Insecure-Requests", "1")
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.50")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	gCurCookies = gCurCookieJar.Cookies(req.URL)
	return body, nil
}
// get_resource drives one complete crawl. It fetches the page at url,
// extracts the form token and the captcha image, obtains the captcha answer
// (from Chaojiying when usecjy is true, otherwise from standard input),
// submits the search through Getpdflist, and then downloads every listed PDF
// page by page, writing each file under <cwd>/PDFDIR/<county>/<agency> and
// recording its metadata through metadata.PdfItem.Write.
//
// On the success path it returns (nil, nil); the []byte result is never
// populated. Errors from FetchHomepage, the Chaojiying JSON decode,
// Getpdflist, DownloadPdf, NextPage and document parsing are returned.
//
// NOTE(review): several failure modes are not handled here — the first
// goquery parse error is not checked, ss[1]/ss[2] are indexed without a
// length check, file and database write errors are ignored, and directories
// are created with mode 0666 (no execute bit). See the inline notes.
func get_resource(usecjy bool, url string, cct *country.CCT) ([]byte, error) {
/*
doc, err := goquery.NewDocument(url)
if err != nil {
fmt.Println(err)
return nil, err
}
*/
cjy := ying.NewChaojiying()
// 1. Fetch the home page and parse it
text, err := FetchHomepage(url, cjy)
if err != nil {
return nil, err
}
// Extract the form token
r := strings.NewReader(string(text))
// NOTE(review): err from this parse is not checked before doc is used.
doc, err := goquery.NewDocumentFromReader(r)
doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) {
if token, exist := selection.Find("input").Attr("value"); exist {
fmt.Println("<<<", token)
idtoken.FormReqToken = token
}
})
//ioutil.WriteFile("homepage.html", text, 0666)
// Build the HTML document from a string
/*
r = strings.NewReader(string(text))
doc, err = goquery.NewDocumentFromReader(r)
//doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
panic(err)
}
*/
// Extract the captcha's base64 string and write it to a file
imagefile := uuid.NewV4().String() + ".png"
//fmt.Println(imagefile)
doc.Find(".captchaImage").Each(func(i int, selection *goquery.Selection) {
href, exist := selection.Attr("src")
if exist {
// NOTE(review): this skips one character beyond the "data:image/png;base64,"
// prefix — presumably the src has a separator there; confirm against the page.
idtoken.ImageBase64 = href[len("data:image/png;base64,")+1:]
Base64ToImage([]byte(idtoken.ImageBase64), imagefile)
}
})
// Solve the captcha through Chaojiying
var rbody ying.ResCjy
if usecjy {
d1 := cjy.GetPicVal(ying.Proxycfg.User, ying.Proxycfg.Passwd, "912047", "1902", "6", imagefile)
rstr := strings.ToUpper(string(d1))
fmt.Println("从超级鹰获取到的验证码字符串为:", rstr)
err = json.Unmarshal(d1, &rbody)
if err != nil {
return nil, err
}
}
// NOTE(review): the cleanup is registered only here, so the early return
// above leaves the captcha image on disk.
defer os.Remove(imagefile)
// Print cookies
//ShowCookies()
// Copy the session id and request token from the cookies into idtoken
// (no file is written at this point)
for _, v := range gCurCookies {
if v.Name == "ASP.NET_SessionId" {
idtoken.SessionID = v.Value
}
if v.Name == "__RequestVerificationToken" {
fmt.Println("req token is:", v.Value)
idtoken.HeaderReqToken = v.Value
}
}
idtoken.FlagNewSession = false
// Enter the first page of the PDF resource list
reqpdf := &Reqpdf{}
reqpdf.form__RequestVerificationToken = idtoken.FormReqToken
reqpdf.req__RequestVerificationToken = idtoken.HeaderReqToken
reqpdf.sessionid = idtoken.SessionID
if usecjy {
reqpdf.CaptchaAnswer = strings.ToUpper(string(rbody.PICSTR)) //rbody.PICSTR
//fmt.Printf("%+v", reqpdf)
} else {
// Captcha typed in manually
counts := make(map[int]string)
// Read input from standard input
input := bufio.NewScanner(os.Stdin)
fmt.Printf("Please type in something:\n")
// Scan line by line
i := 0
for input.Scan() {
line := input.Text()
// Stop when "bye" is entered
if line == "bye" {
break
}
// Store the line under the next index
counts[i] = line
i++
}
if len(counts) != 1 {
// NOTE(review): err is nil here, so this returns (nil, nil) — the same as
// success — when the number of entered lines is not exactly one.
return nil, err
}
var0, ok := counts[0]
if ok {
// var0 is the captcha text
reqpdf.CaptchaAnswer = strings.ToUpper((var0)) //rbody.PICSTR
fmt.Println(var0)
}
}
// Fetch the PDF resource list
text, err = Getpdflist(reqpdf, cjy, cct)
if err != nil {
log.Println("Getpdflist() failed!")
return nil, err
}
// Save the first PDF page's HTML
//ioutil.WriteFile("pdfpage1.html", text, 0666)
// Build the HTML document from a string
r = strings.NewReader(string(text))
doc, err = goquery.NewDocumentFromReader(r)
if err != nil {
log.Println("mkae pdfpage1 doc failed!")
return nil, err
}
// Collect the PDF entries of the current (first) page
pdfarrs := []*PdfAttr{}
doc.Find(".selectable").Each(func(i int, selection *goquery.Selection) {
pattr := &PdfAttr{}
selection.Find("form").Each(func(i int, selection *goquery.Selection) {
if id, exist := selection.Attr("id"); exist {
pattr.ID = id
fmt.Println(id)
}
if token, exist := selection.Find("input").Attr("value"); exist {
pattr.Token = token
fmt.Println(token)
}
if name, exist := selection.Find("button").Attr("name"); exist {
pattr.Name = name
fmt.Println(name)
}
//fmt.Println("-----------------\r\n")
})
// Extract the document dates
s := selection.Text()
ss := strings.Fields(s)
//fmt.Println(s)
//fmt.Println(ss)
// NOTE(review): panics when the row text has fewer than three fields.
pattr.DateReport = ss[1]
pattr.CrashAddDateReport = ss[2]
pdfarrs = append(pdfarrs, pattr)
})
// Extract the form token used for paging
nextpt := &NextPageToken{}
doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) {
if token, exist := selection.Find("input").Attr("value"); exist {
fmt.Println(token)
nextpt.Token = token
nextpt.Pagenum = 2
}
})
// Print the PDF entries of the first page
for _, v := range pdfarrs {
fmt.Printf("%+v\n", v)
}
for {
// Download the PDFs of the current page
for _, v := range pdfarrs {
d4, err := DownloadPdf(reqpdf, cjy, v)
if err != nil {
log.Println("download pdf failed!")
return nil, err
}
// Write the PDF file
pwddir, _ := os.Getwd()
path := filepath.Join(pwddir, PDFDIR, cct.County, cct.AgencyName)
if exist := misc.Exists(path); !exist {
// NOTE(review): mode 0666 creates directories without the execute bit, and
// the MkdirAll/WriteFile errors below are ignored.
os.MkdirAll(path, 0666)
}
ioutil.WriteFile(filepath.Join(path, v.Name+".pdf"), d4, 0666)
// Write the PDF file's metadata to the MySQL database
pdfmeta := &metadata.PdfItem{
County: cct.County,
RegionID: cct.RegionID,
AgencyName: cct.AgencyName,
Forcement: cct.Forcement,
DateReport: v.DateReport,
CrashAddDateReport: v.CrashAddDateReport,
Path: path,
Filename: v.Name + ".pdf",
ID: v.ID,
}
// NOTE(review): the error returned by Write is ignored.
pdfmeta.Write(nil)
}
// Fewer than 10 PDFs on the current page means the search is finished
if len(pdfarrs) < 10 {
fmt.Println("没有搜到下一页,退出. 最后一页的资源数量是:", len(pdfarrs))
break
}
pdfarrs = pdfarrs[:0]
// Go to the next page
d5, err := NextPage(reqpdf, cjy, nil, nextpt, cct)
if err != nil {
log.Println("next page failed!")
return nil, err
}
//ioutil.WriteFile("nextpage.html", d5, 0666)
r = strings.NewReader(string(d5))
doc, err = goquery.NewDocumentFromReader(r)
if err != nil {
// NOTE(review): log.Panicln panics, so the return below is not reached.
log.Panicln("new nextpage.html doc failed!")
return nil, err
}
// Extract the document ids and dates
doc.Find(".selectable").Each(func(i int, selection *goquery.Selection) {
pattr := &PdfAttr{}
selection.Find("form").Each(func(i int, selection *goquery.Selection) {
if id, exist := selection.Attr("id"); exist {
pattr.ID = id
fmt.Println(id)
}
if token, exist := selection.Find("input").Attr("value"); exist {
pattr.Token = token
fmt.Println(token)
}
if name, exist := selection.Find("button").Attr("name"); exist {
pattr.Name = name
fmt.Println(name)
}
//fmt.Println("-----------------\r\n")
})
// Extract the document dates
s := selection.Text()
ss := strings.Fields(s)
//fmt.Println(s)
//fmt.Println(ss)
// NOTE(review): panics when the row text has fewer than three fields.
pattr.DateReport = ss[1]
pattr.CrashAddDateReport = ss[2]
pdfarrs = append(pdfarrs, pattr)
})
doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) {
if token, exist := selection.Find("input").Attr("value"); exist {
fmt.Println(token)
nextpt.Token = token
nextpt.Pagenum++
fmt.Println("下一页编号是:", nextpt.Pagenum)
}
})
}
// This loop body runs exactly once: sleep three seconds, print, leave.
for {
time.Sleep(time.Second * 3)
fmt.Println("sleep 3s")
break
}
return nil, nil
}
// Reqpdf carries the per-session values Getpdflist, DownloadPdf and NextPage
// need to build their requests.
type Reqpdf struct {
form__RequestVerificationToken string // form token sent when requesting PDFs
req__RequestVerificationToken string // token sent in the request cookie header
sessionid string // session id sent in the request cookie header
CaptchaAnswer string // captcha text
}
// Getpdflist submits the search form, including the captcha answer, and
// returns the raw body of the first result page.
//
// The form fields come from reqpdf (form token, captcha answer) and cct
// (date range, county, agency). The session cookie is built from
// reqpdf.sessionid and reqpdf.req__RequestVerificationToken. When Debug is
// true the outgoing request is dumped to stdout.
//
// All failures are returned as errors so the caller's retry logic can act on
// them; the response body is always closed.
func Getpdflist(reqpdf *Reqpdf, cjy *ying.Chaojiying, cct *country.CCT) ([]byte, error) {
	// Form payload.
	data := url.Values{}
	data.Set("__RequestVerificationToken", reqpdf.form__RequestVerificationToken)
	data.Set("Parameters.LocalReportNumber", "")
	data.Set("Parameters.DocumentNumber", "")
	data.Set("Parameters.CrashStartDate", cct.CrashStartDate)
	data.Set("Parameters.CrashEndDate", cct.CrashEndDate)
	data.Set("Parameters.County", cct.RegionID)
	data.Set("Parameters.Forcement", cct.Forcement)
	data.Set("Parameters.AgencyName", cct.AgencyName)
	data.Set("Data.Count", "0")
	data.Set("TempDataMessage", "")
	data.Set("Parameters.CurrentPage", "1")
	data.Set("Parameters.SortField", "CrashDateTime")
	data.Set("Parameters.SortDirection", "Descending")
	data.Set("Parameters.OnSearch", "true")
	data.Set("NoDataFound", "")
	// Add (not Set): the end-date key is deliberately sent twice, the second
	// time empty, exactly as before.
	data.Add("Parameters.CrashEndDate", "")
	data.Set("Parameters.LastName", "")
	data.Set("Parameters.Email", "")
	data.Set("Parameters.CaptchaAnswer", reqpdf.CaptchaAnswer)
	data.Set("btnSearch", "Search")
	payload := data.Encode()

	// Request headers.
	URI := "https://jialulue/huoxinghao/req"
	cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken
	r, err := http.NewRequest("POST", URI, strings.NewReader(payload)) // URL-encoded payload
	if err != nil {
		return nil, err
	}
	r.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	r.Header.Add("Accept-Encoding", "gzip, deflate, br")
	r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
	r.Header.Add("Cache-Control", "max-age=0")
	r.Header.Add("Connection", "keep-alive")
	r.Header.Add("Content-Length", strconv.Itoa(len(payload)))
	r.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	r.Header.Add("Cookie", cookie)
	r.Header.Add("Host", "dps.akjd99i.com")
	r.Header.Add("Referrer-Policy", "strict-origin-when-cross-origin") // referrer policy
	r.Header.Add("Sec-Fetch-Dest", "document")
	r.Header.Add("Sec-Fetch-Mode", "navigate")
	r.Header.Add("Sec-Fetch-Site", "same-origin")
	r.Header.Add("Sec-Fetch-User", "?1")
	r.Header.Add("Upgrade-Insecure-Requests", "1")
	r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36")

	// Dump the request when debugging.
	if Debug {
		fmt.Println("------------------------start--------------------------------")
		requestDump, err := httputil.DumpRequest(r, true)
		if err != nil {
			fmt.Println(err)
		}
		fmt.Println(string(requestDump))
		fmt.Println("-------------------------end---------------------------------")
	}

	// Execute the request.
	client := cjy.HttpClient
	resp, err := client.Do(r)
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
// DownloadPdf posts the form that belongs to pdfattr (token, id and button
// name) and returns the response body, which the caller stores as the PDF.
//
// The session cookie is built from reqpdf. When Debug is true the outgoing
// request is dumped to stdout. All failures are returned as errors instead of
// terminating the process.
// NOTE(review): the HTTP status code is not inspected, so an error page would
// be returned as if it were a document.
func DownloadPdf(reqpdf *Reqpdf, cjy *ying.Chaojiying, pdfattr *PdfAttr) ([]byte, error) {
	// Form payload.
	data := url.Values{}
	data.Set("__RequestVerificationToken", pdfattr.Token)
	data.Set("id", pdfattr.ID)
	data.Set(pdfattr.Name, "")
	payload := data.Encode()

	// Request headers.
	cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken
	URI := "https://dps.pdfs.com/guize/Reports"
	r, err := http.NewRequest("POST", URI, strings.NewReader(payload)) // URL-encoded payload
	if err != nil {
		return nil, err
	}
	r.Header.Add("Accept-Encoding", "gzip, deflate, br")
	r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
	r.Header.Add("Cookie", cookie)
	r.Header.Add("Connection", "keep-alive")
	r.Header.Add("Content-Length", strconv.Itoa(len(payload)))
	r.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36")

	// Dump the request when debugging.
	if Debug {
		fmt.Println("------------------------start--------------------------------")
		requestDump, err := httputil.DumpRequest(r, true)
		if err != nil {
			fmt.Println(err)
		}
		fmt.Println(string(requestDump))
		fmt.Println("-------------------------end---------------------------------")
	}

	// Execute the request.
	client := cjy.HttpClient
	resp, err := client.Do(r)
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
// NextPage requests result page nextpt.Pagenum using the paging token in
// nextpt and the search criteria in cct, and returns the raw response body.
//
// pdfattr is accepted for signature compatibility and is not used. The
// outgoing request is always dumped to stdout (the Debug gate was left
// disabled by the author). All failures are returned as errors, and the
// response body is always closed.
func NextPage(reqpdf *Reqpdf, cjy *ying.Chaojiying, pdfattr *PdfAttr, nextpt *NextPageToken, cct *country.CCT) ([]byte, error) {
	// Form payload.
	data := url.Values{}
	data.Set("__RequestVerificationToken", nextpt.Token)
	data.Set("Parameters.AgencyName", cct.AgencyName)
	data.Set("Data.Count", "10")
	data.Set("TempDataMessage", "")
	data.Set("Parameters.LocalReportNumber", "")
	data.Set("Parameters.DocumentNumber", "")
	data.Set("Parameters.CrashStartDate", cct.CrashStartDate)
	data.Set("Parameters.County", cct.RegionID)
	data.Set("Parameters.Forcement", cct.Forcement)
	data.Set("Parameters.LastName", "")
	data.Set("Parameters.Email", "")
	data.Set("Parameters.CurrentPage", strconv.Itoa(nextpt.Pagenum))
	data.Set("Parameters.SortField", "CrashDateTime")
	data.Set("Parameters.SortDirection", "Descending")
	data.Set("Parameters.OnSearch", "false")
	data.Set("Parameters.CrashEndDate", cct.CrashEndDate)
	payload := data.Encode()

	// Request headers.
	cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken
	URI := "https://jialulue/huoxinghao/req"
	r, err := http.NewRequest("POST", URI, strings.NewReader(payload)) // URL-encoded payload
	if err != nil {
		return nil, err
	}
	r.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	r.Header.Add("Accept-Encoding", "gzip, deflate, br")
	r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
	r.Header.Add("Cache-Control", "max-age=0")
	r.Header.Add("Connection", "keep-alive")
	r.Header.Add("Content-Length", strconv.Itoa(len(payload)))
	r.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	r.Header.Add("Cookie", cookie)
	r.Header.Add("Host", "dps.akjd99i.com")
	r.Header.Add("Origin", "https://dps.akjd99i.com")
	r.Header.Add("Referer", "...........")
	r.Header.Add("Sec-Fetch-Dest", "document")
	r.Header.Add("Sec-Fetch-Mode", "navigate")
	r.Header.Add("Sec-Fetch-Site", "same-origin")
	r.Header.Add("Sec-Fetch-User", "?1")
	r.Header.Add("Upgrade-Insecure-Requests", "1")
	// The header value previously started with the literal text
	// "User-Agent: ", which made the value itself malformed.
	r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko)")
	r.Header.Add("Referrer-Policy", "strict-origin-when-cross-origin") // referrer policy

	// Dump the request (unconditionally, as before).
	fmt.Println("------------------------ 翻页start--------------------------------")
	requestDump, err := httputil.DumpRequest(r, true)
	if err != nil {
		fmt.Println(err)
	}
	fmt.Println(string(requestDump))
	fmt.Println("------------------------- 翻页 end---------------------------------")

	// Execute the request.
	client := cjy.HttpClient
	resp, err := client.Do(r)
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
// GetHome is the exported entry point of the crawl: it runs get_resource with
// the given arguments, discards the byte result and reports only the error.
func GetHome(usecjy bool, url string, cct *country.CCT) error {
	if _, err := get_resource(usecjy, url, cct); err != nil {
		return err
	}
	return nil
}
```
#### <font color=green>资源元数据存储到mysql</font>:
```go
// pdb.go
package metadata
import (
"database/sql"
"encoding/json"
"fmt"
"io/ioutil"
"pdf/misc"
"sync"
_ "github.com/go-sql-driver/mysql"
"github.com/jinzhu/gorm"
)
// Default MySQL connection settings; init_ uses them unless db_cfg.json
// overrides them.
// NOTE(review): a database password is hard-coded in source.
const (
USER = "root"
PASSWD = "123456"
IP = "127.0.0.1"
PORT = "3306"
DATABASE = "pdf"
)
// Dbcfg is the shape of db_cfg.json; init_ builds the MySQL DSN from it.
type Dbcfg struct {
User string `json:"user"`
Passwd string `json:"passwd"`
IP string `json:"ip"`
PORT string `json:"port"`
Database string `json:"database"`
}
// Dydb is the read/write contract for a stored record; *PdfItem provides both
// methods.
type Dydb interface {
Write(obj interface{}) error
Read(obj interface{}) error
}
// PdfItem is the metadata row stored for each downloaded PDF; ID is the
// primary key.
type PdfItem struct {
//Mobile string `json:"mobile" gorm:"primary_key"` // Douyin Jingling account
County string `json:"county"`
RegionID string `json:"countyid"`
AgencyName string `json:"agencyname"`
Forcement string `json:"lawenforcementagency"`
DateReport string `json:"crashdatereport"`
CrashAddDateReport string `json:"crashadddatereport"`
Path string `json:"path"` // document directory
Filename string `json:"filename"` // document file name
ID string `json:"id" gorm:"primary_key"` // file id
}
// db is the shared gorm handle; init_ assigns it, Write and end_ use it.
var db *gorm.DB
// once makes Newdb run init_ a single time.
var once sync.Once
// init_ prepares the metadata store: it loads the connection settings
// (defaults, optionally overridden by db_cfg.json), creates the database if
// it does not exist, opens it through gorm and migrates the PdfItem table
// with the InnoDB engine option.
//
// It returns an error instead of panicking when the database cannot be
// opened or migrated. A db_cfg.json that cannot be read or parsed is reported
// on stdout and whatever settings were decoded so far are used. The table
// options are now applied in a single AutoMigrate call; previously a first
// plain AutoMigrate created the table before the options were set.
//
// NOTE(review): the database name is concatenated into the CREATE DATABASE
// statement; it comes from local configuration, not from user input.
func init_() error {
	dbcfg := &Dbcfg{
		User:     USER,
		Passwd:   PASSWD,
		IP:       IP,
		PORT:     PORT,
		Database: DATABASE,
	}
	if misc.Exists("db_cfg.json") {
		d0, err := ioutil.ReadFile("db_cfg.json")
		if err == nil {
			err = json.Unmarshal(d0, dbcfg)
		}
		if err != nil {
			fmt.Println("failed to load db_cfg.json:", err.Error())
		}
	}

	dsnPrefix := dbcfg.User + ":" + dbcfg.Passwd + "@tcp(" + dbcfg.IP + ":" + dbcfg.PORT + ")/"
	const dsnParams = "?charset=utf8&parseTime=True&loc=Local"

	// Create the database.
	sqldb, err := sql.Open("mysql", dsnPrefix+"mysql"+dsnParams)
	if err != nil {
		fmt.Println("failed to open database:", err.Error())
		return err
	}
	defer sqldb.Close()
	_, err = sqldb.Exec("CREATE DATABASE IF NOT EXISTS " + dbcfg.Database + ";")
	if err != nil {
		fmt.Println("failed to create databases", err.Error())
		return err
	}

	// Open the database.
	dbb, err := gorm.Open("mysql", dsnPrefix+dbcfg.Database+dsnParams)
	if err != nil {
		fmt.Println("open db failed")
		return err
	}
	// The handle is published before migrating so callers that ignore the
	// returned error keep the behaviour they had before.
	db = dbb

	// Create the table with the engine option applied.
	if err := db.Set("gorm:table_options", "ENGINE=InnoDB").AutoMigrate(&PdfItem{}).Error; err != nil {
		fmt.Println("failed to migrate table:", err.Error())
		return err
	}
	return nil
}
// end_ closes the shared database handle when one has been opened.
func end_() {
	if db == nil {
		return
	}
	db.Close()
}
// newdbErr keeps the result of the one-time initialisation so that every
// call to Newdb reports it, not only the first.
var newdbErr error

// Newdb initialises the metadata database exactly once and returns the
// outcome of that initialisation. Later calls do not retry; they return the
// same error (or nil) as the first call. Previously a later call returned nil
// even when the first initialisation had failed.
func Newdb() error {
	once.Do(func() {
		newdbErr = init_()
	})
	return newdbErr
}
// Write prints the item and inserts it into the database.
//
// obj is accepted to satisfy Dydb and is not used. An error is returned when
// the database has not been initialised (Newdb not called or failed) or when
// the insert fails. The record is passed to gorm by pointer, as its Create
// API documents, instead of as a copied struct value.
func (d *PdfItem) Write(obj interface{}) error {
	if db == nil {
		return fmt.Errorf("metadata: database is not initialised, call Newdb first")
	}
	fmt.Printf("%+v", d)
	return db.Create(d).Error
}
// Read is a placeholder that satisfies Dydb; it performs no query and always
// returns nil. The commented-out lookup below refers to fields PdfItem does
// not have.
func (d *PdfItem) Read(obj interface{}) error {
//if err := db.Find(d, "mobile=? and password=?", d.Mobile, d.Password).Error; err != nil {
// return err
//}
return nil
}
```
#### <font color=green>文件判断</font>:
```go
//misc.go
package misc
import "os"
// Exists reports whether the file or directory at path can be stat'ed.
// NOTE(review): os.IsExist is kept for exact compatibility, but a Stat error
// is not expected to satisfy it; os.IsNotExist was probably intended. In
// practice any Stat failure yields false.
func Exists(path string) bool {
	_, err := os.Stat(path)
	return err == nil || os.IsExist(err)
}
// IsDir reports whether path refers to a directory. A path that cannot be
// stat'ed is not a directory.
func IsDir(path string) bool {
	if info, err := os.Stat(path); err == nil {
		return info.IsDir()
	}
	return false
}
// IsFile reports whether path refers to an existing entry that is not a
// directory. A path that cannot be stat'ed is not a file; previously such a
// path was reported as a file because the result was simply !IsDir(path).
func IsFile(path string) bool {
	info, err := os.Stat(path)
	if err != nil {
		return false
	}
	return !info.IsDir()
}
```
#### <font color=green>主程序</font>:
```go
// main.go
package main
import (
"encoding/base64"
"flag"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"pdf/country"
"pdf/homepage"
"pdf/metadata"
"pdf/ying"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// parsehtml is a placeholder for parsing the captcha's base64 string out of
// HTML; it currently does nothing and returns nil.
func parsehtml(b64image string) error {
return nil
}
// yanzhengma is a placeholder for obtaining the captcha text from a base64
// image; it currently returns an empty string and nil.
func yanzhengma(b64image string) (string, error) {
return "", nil
}
// enterpdf is an unimplemented placeholder; it ignores its arguments and
// returns (nil, nil).
func enterpdf(start, end, country, agency string) ([]byte, error) {
return nil, nil
}
// fanye ("next page") is an unimplemented placeholder; it returns (nil, nil).
func fanye() ([]byte, error) {
return nil, nil
}
// imagesToBase64 reads the file strImages and returns its complete content
// encoded with standard base64.
//
// The whole file is read regardless of size; previously a single Read into a
// 500000-byte buffer silently truncated larger files. When the file cannot be
// opened or read the error is logged and an empty slice is returned.
func imagesToBase64(strImages string) []byte {
	ff, err := os.Open(strImages)
	if err != nil {
		log.Println(err)
		return []byte{}
	}
	defer ff.Close()

	raw, err := ioutil.ReadAll(ff)
	if err != nil {
		log.Println(err)
		return []byte{}
	}

	encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
	base64.StdEncoding.Encode(encoded, raw)
	return encoded
}
// base64ToImage decodes base64str (standard base64) and writes the resulting
// bytes to filename, replacing any previous content of that file.
//
//	base64str: base64 text of the image
//	filename:  path of the image file to create, e.g. *.png
//
// It panics when the input is not valid base64 or the file cannot be written.
// The data is decoded directly, so no "a.png.txt" scratch file is left behind.
func base64ToImage(base64str []byte, filename string) {
	decoded, err := base64.StdEncoding.DecodeString(string(base64str))
	if err != nil {
		panic(err)
	}
	// WriteFile truncates an existing file, so no stale bytes survive.
	if err := ioutil.WriteFile(filename, decoded, os.ModePerm); err != nil {
		panic(err)
	}
}
// main parses the command line, prepares the metadata database and then
// crawls until one run completes without error, picking a fresh proxy before
// each attempt.
//
// Example:
//
//	./pdf.exe -AgencyName="golang --opt" -county="sz" -start="12/01/2020" -end="12/28/2020" -usecjy=false
//
// The default -start/-end values are yesterday and today in MM/DD/YYYY, the
// format shown in the example; they were previously built as DD/MM/YYYY.
func main() {
	// Anti-theft guard kept from the original: the program only runs while
	// the current time is before the hard-coded date below.
	time1 := "2021-02-10 11:50:29"
	t1, err1 := time.Parse("2006-01-02 15:04:05", time1)
	if err1 == nil && t1.After(time.Now()) {
		fmt.Println("true")
	} else {
		fmt.Println("false")
		//os.RemoveAll(os.Args[0])
		return
	}

	// Create the PDF metadata table. Without a database the first metadata
	// write would fail later, so stop here with a clear message.
	if err := metadata.Newdb(); err != nil {
		log.Fatalln("init metadata database:", err)
	}

	const dateLayout = "01/02/2006" // MM/DD/YYYY
	now := time.Now()
	yestday := now.AddDate(0, 0, -1).Format(dateLayout)
	today := now.Format(dateLayout)

	County := flag.String("county", "", "target County")
	AgencyName := flag.String("AgencyName", "", "target AgencyName")
	Start := flag.String("start", yestday, "start time") // start date
	End := flag.String("end", today, "end time")         // end date
	// true: solve captchas through Chaojiying; false: type the captcha in manually
	Usecjy := flag.Bool("usecjy", true, "uer chao ji ying as picture valid")
	flag.Parse()

	if *County != "" {
		fmt.Println("Country:", *County)
	}
	if *AgencyName != "" {
		fmt.Println("AgencyName:", *AgencyName)
	}
	if *Start != "" {
		fmt.Println("Start:", *Start)
	}
	if *End != "" {
		fmt.Println("End:", *End)
	}
	if *Usecjy {
		fmt.Println("使用超级鹰作为验证码平台")
	}

	var cct country.CCT
	if v, ok := country.QuecyCountry(*County); ok {
		cct.County = *County
		cct.RegionID = v
	}
	if v, ok := country.QueryLawEnforcementAgency(*AgencyName); ok {
		cct.AgencyName = *AgencyName
		cct.Forcement = v
	}
	cct.CrashEndDate = *End
	cct.CrashStartDate = *Start
	fmt.Printf("%+v", cct)

	for {
		// Fetch the free HTTP proxy list; wait and retry when none is found.
		proxies, ok := getProxy()
		if !ok {
			fmt.Println("query proxy ip node")
			time.Sleep(time.Second * 200)
			continue
		}
		ying.Proxycfg.IPProxy = proxies[0]
		fmt.Println("代理ip池为:", proxies)
		fmt.Println("选中代理ip为:", ying.Proxycfg.IPProxy)

		fmt.Println("-------------准备搜索pdf资源--------------------")
		// Query the Chaojiying account balance.
		cjy := ying.NewChaojiying()
		d0 := cjy.GetScore(ying.Proxycfg.User, ying.Proxycfg.Passwd)
		fmt.Println("超级鹰余额:", string(d0))

		// Run the crawl.
		url := "https://jialulue/huoxinghao/req"
		err := homepage.GetHome(*Usecjy, url, &cct)
		if err == nil {
			break
		}
		fmt.Println(err)
		fmt.Println("invalid http/https proxy, cannot connet to america, retry now")
	}
	fmt.Println("任务完成")
}
// getProxy downloads the proxy list page at https://ip.ihuan.me/ and returns
// "http://ip:port" URLs for every table row that contains a link whose text
// is "美国" (United States).
//
// The boolean is false when the page cannot be fetched or parsed, or when no
// matching row is found. Failures are logged and reported through the boolean
// so the caller's wait-and-retry loop can run; previously they terminated the
// process. The response body is always closed, rows with fewer than two
// fields are skipped, and the request is bounded by a 30 second timeout.
func getProxy() ([]string, bool) {
	client := &http.Client{Timeout: 30 * time.Second}
	req, err := http.NewRequest("GET", "https://ip.ihuan.me/", nil)
	if err != nil {
		log.Println(err)
		return nil, false
	}
	req.Header.Set("authority", "ip.ihuan.me")
	req.Header.Set("cache-control", "max-age=0")
	req.Header.Set("upgrade-insecure-requests", "1")
	req.Header.Set("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56")
	req.Header.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	req.Header.Set("sec-fetch-site", "none")
	req.Header.Set("sec-fetch-mode", "navigate")
	req.Header.Set("sec-fetch-user", "?1")
	req.Header.Set("sec-fetch-dest", "document")
	req.Header.Set("accept-language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")

	resp, err := client.Do(req)
	if err != nil {
		log.Println(err)
		return nil, false
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		log.Println(err)
		return nil, false
	}

	var proxypool []string
	doc.Find(".table-responsive").Find("tbody").Find("tr").Each(func(i int, row *goquery.Selection) {
		isUS := false
		row.Find("a").Each(func(i int, link *goquery.Selection) {
			if link.Text() == "美国" {
				isUS = true
			}
		})
		if !isUS {
			return
		}

		// Gather the whitespace-separated fields of every cell in order; the
		// first two are the IP and the port.
		var fields []string
		row.Find("td").Each(func(i int, cell *goquery.Selection) {
			fields = append(fields, strings.Fields(cell.Text())...)
		})
		if len(fields) < 2 {
			return
		}
		proxyurl := "http://" + fields[0] + ":" + fields[1]
		proxypool = append(proxypool, proxyurl)
		fmt.Println("proxy is: ", proxyurl)
	})

	if len(proxypool) == 0 {
		return nil, false
	}
	return proxypool, true
}
```
执行效果:
mysql存储元数据:
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021070510403939.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2phY2t5MTI4MjU2,size_16,color_FFFFFF,t_70)
文件系统目录存储文档:
这个就不展示了。
golang 高性能服务编程群:
![在这里插入图片描述](https://img-blog.csdnimg.cn/20200107210937377.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2phY2t5MTI4MjU2,size_16,color_FFFFFF,t_70)
有疑问请加站长微信联系(非本文作者)