效果图
Tips:当抓取到的内容中含有换行或空格等多余字符时,可以使用 strings.Replace(content, "\n", "", -1) 将其去掉。
源码
// Command main downloads joke listing pages from www.pengfue.com, follows
// every joke link found on each listing page, and writes the extracted titles
// and contents to one text file per listing page in the working directory.
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"
)

// httpClient is shared by all requests so that connections are reused and a
// stalled server cannot block a crawler goroutine forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// The patterns are compiled once at package scope instead of on every call.
var (
	// jokeLinkRe captures the numeric/ID part of each joke link on a listing page.
	jokeLinkRe = regexp.MustCompile(`<a href="https://www.pengfue.com/content_(.*?).html" target="_blank">`)

	// titleRe captures the text between the first <h1> and </h1>, newlines included.
	titleRe = regexp.MustCompile(`<h1>(?s:(.*?))</h1>`)

	// contentRe captures the joke body that precedes the "prev" link.
	// NOTE(review): `[^<img]` is a character class (any single character other
	// than '<', 'i', 'm' or 'g'), not a negative lookahead for "<img"; the
	// capture therefore includes that final character. The pattern is
	// kept unchanged to preserve the scraper's output — confirm against the
	// real page markup before altering it.
	contentRe = regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?)[^<img])<a id="prev" href=`)
)

// contentCleaner strips newlines, tabs and spaces from a joke body in one pass.
var contentCleaner = strings.NewReplacer("\n", "", "\t", "", " ", "")

// HttpGet fetches url and returns the whole response body as a string.
//
// It returns an error when the request fails, when the server answers with a
// status other than 200 OK, or when reading the body fails. On error the
// returned string is empty.
func HttpGet(url string) (result string, err error) {
	resp, err := httpClient.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.Copy reports every read error (other than io.EOF), including one
	// that arrives together with zero bytes.
	var body strings.Builder
	if _, copyErr := io.Copy(&body, resp.Body); copyErr != nil {
		return "", copyErr
	}
	return body.String(), nil
}

// SpderPage crawls listing page number index, fetches every joke linked from
// it, saves the results with SaveJokeFile and finally sends index on page.
//
// index is always sent on page, even when the listing page cannot be
// fetched, so that a receiver counting completions never blocks forever.
func SpderPage(index int, page chan int) {
	defer func() { page <- index }()

	url := "https://www.pengfue.com/xiaohua_" + strconv.Itoa(index) + ".html"
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err:", err)
		return
	}

	links := jokeLinkRe.FindAllStringSubmatch(result, -1)
	fileTitle := make([]string, 0, len(links))
	filecontent := make([]string, 0, len(links))
	for _, link := range links {
		title, content, err := SpiderJokePage("https://www.pengfue.com/content_" + link[1] + ".html")
		if err != nil {
			// A single broken joke page must not abort the whole listing page.
			fmt.Println("SpiderJokePage err:", err)
			continue
		}
		fileTitle = append(fileTitle, title)
		filecontent = append(filecontent, content)
	}

	SaveJokeFile(index, fileTitle, filecontent)
}

// SaveJokeFile writes the given titles and contents to "第<index>页.txt" in the
// current working directory, one title line and one content line per joke,
// each followed by a separator line.
//
// A title without a matching content entry is written with an empty content
// line. Failures are reported on standard output; nothing is returned.
func SaveJokeFile(index int, fileTitle, filecontent []string) {
	wd, err := os.Getwd()
	if err != nil {
		fmt.Println("os.Getwd err:", err)
		return
	}

	path := filepath.Join(wd, "第"+strconv.Itoa(index)+"页.txt")
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.Create err:", err)
		return
	}
	defer func() {
		if closeErr := f.Close(); closeErr != nil {
			fmt.Println("file close err:", closeErr)
		}
	}()

	// Assemble the whole file in memory and write it with a single call.
	var out strings.Builder
	for i, title := range fileTitle {
		content := ""
		if i < len(filecontent) {
			content = filecontent[i]
		}
		out.WriteString(title + "\n" + content + "\n")
		out.WriteString("------------分割线--------------\n")
	}
	if _, err := f.WriteString(out.String()); err != nil {
		fmt.Println("file write err:", err)
	}
}

// SpiderJokePage fetches a single joke page and extracts its title and body.
//
// The title is the first <h1> element with tabs removed; the content is the
// first match of contentRe with newlines, tabs and spaces removed. Either is
// empty when the page does not match. err is non-nil only when the page
// cannot be fetched.
func SpiderJokePage(url string) (title, content string, err error) {
	result, err := HttpGet(url)
	if err != nil {
		return "", "", fmt.Errorf("fetching %s: %w", url, err)
	}

	if m := titleRe.FindStringSubmatch(result); m != nil {
		title = strings.Replace(m[1], "\t", "", -1)
	}
	if m := contentRe.FindStringSubmatch(result); m != nil {
		content = contentCleaner.Replace(m[1])
	}
	return title, content, nil
}

// toWork crawls listing pages start through end (inclusive) concurrently, one
// goroutine per page, and prints a line as each page reports completion.
func toWork(start int, end int) {
	fmt.Printf("正在爬取第%d到第%d页\n", start, end)

	page := make(chan int)
	for i := start; i <= end; i++ {
		go SpderPage(i, page)
	}
	// Every SpderPage call sends exactly once, so receive once per page.
	for i := start; i <= end; i++ {
		fmt.Printf("第%d页爬取完毕\n", <-page)
	}
}

// main reads the first and last page numbers from standard input and starts
// the crawl. Unparseable input or an invalid range ends the program early.
func main() {
	var start, end int

	fmt.Println("请输入起始页(>=1)")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	fmt.Println("请输入结束页(>=start)")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("Scan err:", err)
		return
	}
	if start < 1 || end < start {
		fmt.Println("invalid page range:", start, end)
		return
	}

	toWork(start, end)
}
有疑问加站长微信联系(非本文作者)