一个简单的爬虫小程序-golang

石鸟路遇 · · 668 次点击 · · 开始浏览    
这是一篇创建时间较早的文章(具体创建日期在转载时丢失),其中的信息可能已经有所发展或是发生改变。

将下载的 ts 片段聚合成 MP4 需要系统预先安装 ffmpeg。

package main

/*

* @auth:hjx

* 2020-03-29

*/

import (

"flag"

"fmt"

"io"

"io/ioutil"

"log"

"net/http"

"os"

"os/exec"

"regexp"

"runtime"

"strconv"

"strings"

"sync"

"time"

)

// Log prefixes passed to hlog and the base address of the crawled site.
const (
	// PREFIX_INFO is the log prefix for informational messages.
	PREFIX_INFO = "hjx->[INFO]:"
	// PREFIX_ERROR is the log prefix for error messages.
	PREFIX_ERROR = "hjx->[ERROR]:"
	// PREFIX_WARNING is the log prefix for warning messages.
	PREFIX_WARNING = "hjx->[WARNING]:"
	// TARGET_URL is the base URL that page and video paths are appended to.
	TARGET_URL = "http://www.askme.com/"
)

// Command-line flags. Each variable is a pointer that holds the parsed value
// once flag.Parse has run in main.
var (
	// disk is the drive letter combined with folder to form the working path.
	disk = flag.String("d", "e", "Stored disk path")
	// folder is the working folder on that drive.
	folder = flag.String("f", "goqbl", "Stored folder path default e:/goqbl")
	// mergerFolder is the sub-folder that receives the merged mp4 files.
	mergerFolder = flag.String("m", "MV", "Stored merger mp4 path default e:/goqbl/MV")
	// page is the number of listing pages to crawl.
	page = flag.Int("p", 1, "Stored the spider mv nums 1*24 ; default 1*24")
	// downNum is the number of groups the targets are split into; main
	// starts one goroutine per group.
	downNum = flag.Int64("n", 6, "The default maximum number of concurrent ,default 6")
)

// Shared program state.
var (
	// targetlist collects one single-entry map (title -> m3u8 URL) per video.
	targetlist []map[string]string
	// lock serialises log output and access to the downed-file record.
	lock sync.Mutex
	// wg lets main wait for every download goroutine to finish.
	wg sync.WaitGroup
)

func main() {

fmt.Println("Buddha bless hjxSpider program no bugs,Start now")

flag.Parse()

runtime.GOMAXPROCS(runtime.NumCPU())

filePath := *disk +":/" + *folder

mergerFolderPath := filePath +"/" + *mergerFolder

logFile := filePath +"/" +"spider.log"

  downedFile := filePath +"/" +"downed.log"

  //crearDIR

  makeWorkFolder :=createDir(mergerFolderPath, logFile)

if makeWorkFolder ==false {

hlog(logFile, PREFIX_ERROR, "An unexpected error occurred while creating the folder", mergerFolderPath, "the program is about to exit")

os.Exit(1)

}

hlog(logFile, PREFIX_INFO, "Please wait while getting the current connection")

getTargetPageURL(*page, logFile)

if len(targetlist) ==0 {

hlog(logFile, PREFIX_ERROR, "Did not get titles and links the program is about to exit")

os.Exit(1)

}

listGroup :=splitQBLArray(targetlist, *downNum)

threadFlag :=0

  for _, mapList :=range listGroup {

threadNum := threadFlag +1

      threadFlag++

wg.Add(1)

go getDownTs(mapList, threadNum, logFile, downedFile, filePath, mergerFolderPath)

}

wg.Wait()

fmt.Println("Thanks for Buddha blessed hjxSpider program no bugs,over now")

}

func getDownTs(mapList []map[string]string, threadNumint, fileName, downedFile, downPath, mergerPathstring)bool {

if len(mapList) ==0 {

hlog(fileName, PREFIX_ERROR, "the thread-num:", strconv.Itoa(threadNum), "No m3u8 file could be down")

wg.Done()

return false

  }

for _, indexMap :=range mapList {

for title, index :=range indexMap {

lock.Lock()

isDown :=cheakDown(downedFile, title)

lock.Unlock()

threadWT :="ThreadNum:" +strconv.Itoa(threadNum)

if isDown {

hlog(fileName, PREFIX_WARNING, threadWT, "the title:", title, "Has been downed ")

continue

        }

downLoadTsFolder := downPath +"/" + title

hlog(fileName, PREFIX_INFO, threadWT, "The file being downloaded is ", title, "Folder:", downLoadTsFolder)

makeMVFolder :=createDir(downLoadTsFolder, fileName)

if !makeMVFolder {

continue

        }

//os.Chdir(downLoadTsFolder)

        indexText :=getURL(index)

if indexText =="" {

continue

        }

indexM3u8, err :=os.OpenFile(downLoadTsFolder+"/index.m3u8", os.O_CREATE|os.O_RDWR, 0644)

if err !=nil {

hlog(fileName, PREFIX_ERROR, threadWT, title, "An unexpected error occurred while creating index.m3u8, the program is about to end")

continue

        }

lineList :=strings.Split(indexText, "\n")

var downTslist []string

        for _, line :=range lineList {

if line !="" &&string([]rune(line)[0]) =="#" {

if line !="#EXTINF:10.041667," {

indexM3u8.WriteString(line +"\n")

}

}else {

lineSplitList :=strings.Split(line, "/")

tsName := lineSplitList[len(lineSplitList)-1]

if tsName =="aaa0.ts" {

continue

              }else {

indexM3u8.WriteString(tsName +"\n")

downTslist =append(downTslist, line)

}

}

}

        hlog(fileName, PREFIX_INFO, threadWT, title, "ts file has been obtained")

lostTsnum :=0

        for _, url :=range downTslist {

if len(url) !=0 {

lineSplitList :=strings.Split(url, "/")

tsName := lineSplitList[len(lineSplitList)-1]

var downtsTimes =0

            loop:

hlog(fileName, PREFIX_INFO, threadWT, title, "The ts being downloaded is:", tsName, " Current downloads:", strconv.Itoa(downtsTimes+1))

res, err :=http.Get(url)

if err !=nil && downtsTimes !=5 {

downtsTimes++

goto loop

}

if downtsTimes ==5 {

hlog(fileName, PREFIX_ERROR, threadWT, "title:", title, "ts:", tsName, "Download failed, lost current ts")

lostTsnum++

continue

              }

f, err :=os.Create(downLoadTsFolder +"/" + tsName)

if err !=nil {

continue

              }

io.Copy(f, res.Body)

time.Sleep(time.Duration(2) *time.Second)

}

}

if lostTsnum >5 {

continue

        }

mergerTs :=exec.Command("cmd", "/C", "ffmpeg", "-i", downLoadTsFolder+"/index.m3u8", "-vcodec", "copy", "-acodec", "copy", "-absf", "aac_adtstoasc", mergerPath+"/"+strings.Replace(title, " ", "", -1)+".mp4")

if err := mergerTs.Run(); err !=nil {

fmt.Println("Error: ", err)

hlog(fileName, PREFIX_ERROR, title, "mergred", mergerPath+"/"+strings.Replace(title, " ", "", -1)+".mp4 file failure")

}else {

hlog(fileName, PREFIX_INFO, title, "aready mergred", mergerPath+"/"+strings.Replace(title, " ", "", -1)+".mp4 success")

lock.Lock()

f, err :=os.OpenFile(downedFile, os.O_CREATE|os.O_APPEND, 0644)

if err !=nil {

hlog(fileName, PREFIX_ERROR, "create", downedFile, "failure")

}

f.WriteString(title)

defer f.Close()

lock.Unlock()

}

defer indexM3u8.Close()

}

}

wg.Done()

return true

}

// cheakDown reports whether title occurs anywhere in the contents of
// cheakDownFile. The file is created empty when it does not exist yet.
//
// Errors are printed to standard output; when the file cannot be opened the
// function returns false.
func cheakDown(cheakDownFile, title string) bool {
	f, err := os.OpenFile(cheakDownFile, os.O_CREATE|os.O_RDONLY, 0644)
	if err != nil {
		fmt.Print(err)
		return false
	}
	// This function is called once per title, so the handle must not leak.
	defer f.Close()

	b, err := ioutil.ReadAll(f)
	if err != nil {
		fmt.Print(err)
	}
	return strings.Contains(string(b), title)
}

// splitQBLArray divides arr into num consecutive groups of len(arr)/num
// elements each; the last group additionally receives the remainder.
//
// When arr has fewer than num elements, or num is not positive, the whole
// slice is returned as a single group. The groups share arr's backing array.
func splitQBLArray(arr []map[string]string, num int64) [][]map[string]string {
	total := int64(len(arr))
	// num <= 0 would otherwise divide by zero or yield no groups at all.
	if num <= 0 || total < num {
		return [][]map[string]string{arr}
	}

	size := total / num
	segmens := make([][]map[string]string, 0, num)
	for i := int64(0); i < num; i++ {
		start := i * size
		if i == num-1 {
			segmens = append(segmens, arr[start:])
		} else {
			segmens = append(segmens, arr[start:start+size])
		}
	}
	return segmens
}

//get target URL

func getTargetPageURL(PAGEint, fileNamestring) {

for i :=0; i < PAGE; i++ {

//url := TARGET_URL + "/videos/japanese?page=" + strconv.Itoa(i+1)

      url :=TARGET_URL +"/videos/amateur?page=" +strconv.Itoa(i+1)

targetText :=getURL(url)

if targetText =="" {

hlog(fileName, PREFIX_ERROR, "get", url, "An unexpected error,the program is about to exit")

}else {

regURL :=regexp.MustCompile(`href="/video/\d*/"`)

regTargetListURL := regURL.FindAllString(targetText, -1)

regTitle :=regexp.MustCompile(`title="[^"]*"`)

regTargetListTitle := regTitle.FindAllString(targetText, -1)

regTargetListTitle = regTargetListTitle[2 :len(regTargetListTitle)-2]

//var titleURL map[string]string

        for i, indexURL :=range regTargetListURL {

mapValueURL :=TARGET_URL +strings.TrimSpace(strings.Split(indexURL, "\"")[1])

mapKey :=strings.TrimSpace(strings.Split(regTargetListTitle[i], "\"")[1])

indexTarget :=getURL(mapValueURL)

regIndex :=regexp.MustCompile(`<source src="[^"]*"`)

regIndexURL := regIndex.FindString(indexTarget)

regIndexURL ="http:" +strings.TrimSpace(strings.Split(regIndexURL, "\"")[1])

targetlist =append(targetlist, map[string]string{

mapKey: regIndexURL,

            })

}

}

}

}

// pageClient is shared by every getURL call. The timeout keeps a stalled
// server from blocking the crawl forever.
var pageClient = &http.Client{Timeout: 30 * time.Second}

// getURL fetches target with an HTTP GET and returns the response body as a
// string. It returns "" when the request fails, the status is not 2xx or the
// body cannot be read.
func getURL(target string) string {
	resp, err := pageClient.Get(target)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	// An error page must not be handed to callers as if it were the
	// requested document.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return ""
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ""
	}
	return string(body)
}

func createDir(folder, fileNamestring)bool {

if _, err :=os.Stat(folder); err !=nil {

if os.IsNotExist(err) {

os.MkdirAll(folder, os.ModePerm)

hlog(fileName, PREFIX_INFO, "Created", folder, "successfully")

}else {

return false

      }

}else {

hlog(fileName, PREFIX_INFO, "Working path", folder, "already exists")

}

return true

}

func hlog(fileName, PREFIXstring, logStrList ...string) {

lock.Lock()

logFile, err :=os.OpenFile(fileName, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0666)

if err !=nil {

panic(err)

}

defer logFile.Close()

mw :=io.MultiWriter(os.Stdout, logFile)

log.SetOutput(mw)

log.SetPrefix(PREFIX)

logPrint :=""

  for _, logStr :=range logStrList {

logPrint = logPrint + logStr +" "

  }

log.Println(logPrint)

lock.Unlock()

}


有疑问加站长微信联系(非本文作者)

本文来自:简书

感谢作者:石鸟路遇

查看原文:一个简单的爬虫小程序-golang

入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889

668 次点击  
加入收藏 微博
暂无回复
添加一条新回复 (您需要 登录 后才能回复 没有账号 ?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传