使用 Go 语言爬取捧腹网的笑话

cen9jing · · 678 次点击 · · 开始浏览    
这是一篇创建时间较早的文章,其中的信息可能已经有所发展或是发生改变。

效果图

提示:当抓取到的内容中出现多余的换行或空格等字符时,可以使用 strings.Replace(content, "\n", "", -1) 将其去掉(把第二个参数换成要去掉的字符即可)。

源码

package main

import (
   "fmt"
   "io"
   "net/http"
   "os"
   "regexp"
   "strconv"
   "strings"
)

// HttpGet issues an HTTP GET request for url and returns the complete
// response body as a string.
//
// A non-nil error is returned when the request cannot be performed or when
// reading the body fails; in that case result is empty.
func HttpGet(url string) (result string, err error) {
	rep, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer rep.Body.Close()

	// Stream the body into a builder. This avoids the quadratic cost of
	// repeated string concatenation and surfaces every read error, including
	// one that is reported together with zero bytes read.
	var body strings.Builder
	if _, err := io.Copy(&body, rep.Body); err != nil {
		return "", err
	}
	return body.String(), nil
}

// jokeLinkRe matches the anchor tag that links a listing page to a joke
// detail page and captures the part of the URL between "content_" and ".html".
var jokeLinkRe = regexp.MustCompile(`<a href="https://www.pengfue.com/content_(.*?).html" target="_blank">`)

// SpderPage crawls listing page number index: it downloads the listing,
// follows every joke link found on it, collects each joke's title and
// content, and hands them to SaveJokeFile.
//
// index is sent on page exactly once when the function finishes, whether or
// not the crawl succeeded.
func SpderPage(index int, page chan int) {
	// Always signal completion. toWork receives one value per goroutine it
	// started, so returning without sending would leave it blocked forever.
	defer func() { page <- index }()

	url := "https://www.pengfue.com/xiaohua_" + strconv.Itoa(index) + ".html"
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err:", err)
		return
	}

	alls := jokeLinkRe.FindAllStringSubmatch(result, -1)
	fileTitle := make([]string, 0, len(alls))
	filecontent := make([]string, 0, len(alls))
	for _, jokeurl := range alls {
		title, content, err := SpiderJokePage("https://www.pengfue.com/content_" + jokeurl[1] + ".html")
		if err != nil {
			// One broken joke page should not abort the whole listing.
			fmt.Println("SpiderJokePage err:", err)
			continue
		}
		fileTitle = append(fileTitle, title)
		filecontent = append(filecontent, content)
	}
	SaveJokeFile(index, fileTitle, filecontent)
}
// SaveJokeFile writes the jokes of listing page index to the file
// "第<index>页.txt" in the current working directory, replacing any existing
// file of that name.
//
// fileTitle[i] and filecontent[i] describe the same joke. Each joke is
// written as its title, its content and a separator line. If the two slices
// differ in length, only the pairs present in both are written. Failures are
// reported on standard output because the function has no error result.
func SaveJokeFile(index int, fileTitle, filecontent []string) {
	// A relative name is resolved against the working directory by the OS, so
	// there is no need to call os.Getwd and assemble the path by hand.
	name := "第" + strconv.Itoa(index) + "页.txt"
	f, err := os.Create(name)
	if err != nil {
		fmt.Println("os.Create err:", err)
		return
	}
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Println("Close err:", err)
		}
	}()

	// Guard against mismatched slices instead of indexing out of range.
	n := len(fileTitle)
	if len(filecontent) < n {
		n = len(filecontent)
	}
	for i := 0; i < n; i++ {
		entry := fileTitle[i] + "\n" + filecontent[i] + "\n" +
			"------------分割线--------------\n"
		if _, err := f.WriteString(entry); err != nil {
			fmt.Println("WriteString err:", err)
			return
		}
	}
}
var (
	// jokeTitleRe captures the text of the first <h1> element; the (?s)
	// flag lets the title span several lines.
	jokeTitleRe = regexp.MustCompile(`<h1>(?s:(.*?))</h1>`)

	// jokeContentRe captures the text between the content <div> and the
	// "prev" navigation link.
	// NOTE(review): `[^<img]` is a character class (one character that is
	// none of '<', 'i', 'm', 'g'), not "no <img> tag". The pattern is kept
	// unchanged here; confirm the intent against the live page markup.
	jokeContentRe = regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?)[^<img])<a id="prev" href=`)
)

// SpiderJokePage downloads the joke detail page at url and extracts its
// title and content.
//
// Tabs are stripped from the title; newlines, tabs and "&nbsp;" entities are
// stripped from the content. A part whose pattern does not match is returned
// as the empty string. err is non-nil only when the page cannot be fetched.
func SpiderJokePage(url string) (title, content string, err error) {
	result, err := HttpGet(url)
	if err != nil {
		// Return the error with context instead of printing it as well: the
		// caller already logs it, so printing here reported it twice.
		return "", "", fmt.Errorf("fetching joke page %s: %w", url, err)
	}

	if m := jokeTitleRe.FindStringSubmatch(result); m != nil {
		title = strings.Replace(m[1], "\t", "", -1)
	}

	if m := jokeContentRe.FindStringSubmatch(result); m != nil {
		content = m[1]
		content = strings.Replace(content, "\n", "", -1)
		content = strings.Replace(content, "\t", "", -1)
		content = strings.Replace(content, "&nbsp;", "", -1)
	}
	return title, content, nil
}

// toWork crawls the listing pages start through end (inclusive), starting
// one SpderPage goroutine per page, and prints each page number as its
// goroutine reports back on the shared channel.
func toWork(start int, end int) {
	fmt.Printf("正在爬取第%d到第%d页\n", start, end)

	done := make(chan int)
	pending := 0
	for pageNo := start; pageNo <= end; pageNo++ {
		go SpderPage(pageNo, done)
		pending++
	}

	// Wait for exactly as many reports as goroutines were started.
	for ; pending > 0; pending-- {
		finished := <-done
		fmt.Printf("第%d页爬取完毕\n", finished)
	}
}
// main prompts for the first and last listing page on standard input and
// crawls that inclusive range. Input that cannot be parsed as an integer, or
// a range that violates the constraints shown in the prompts, is rejected
// with a message instead of being crawled.
func main() {
	var start, end int

	fmt.Println("请输入起始页(>=1)")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}

	fmt.Println("请输入结束页(>=start)")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}

	// Enforce what the prompts promise; otherwise page 0 or an empty range
	// would be accepted silently.
	if start < 1 || end < start {
		fmt.Println("页码范围无效: 需要满足 1 <= 起始页 <= 结束页")
		return
	}

	toWork(start, end)
}

有疑问加站长微信联系(非本文作者)

入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889

678 次点击  ∙  2 赞  
加入收藏 微博
2 回复  |  直到 2020-07-23 13:08:11
暂无回复
添加一条新回复 (您需要 登录 后才能回复 没有账号 ?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传