golang版本对某网站HLS爬取与聚合

石鸟路遇 · · 769 次点击 · · 开始浏览    
这篇文章创建已有一段时间,其中的信息可能已经有所发展或是发生改变。

运行前需要在系统中预先安装 ffmpeg,并确保可以在命令行中直接调用(程序通过 cmd /C ffmpeg 合并下载好的 ts 分片)。

package main
/*
    @Author: Create by hjx
    @Date:2020-02-03
*/
import (
    "flag"
    "fmt"
    "io"
    "io/ioutil"
    "net/http"
    "os"
    "os/exec"
    "regexp"
    "runtime"
    "strconv"
    "strings"
    "sync"
    "time"
)

// Fixed settings and flag defaults for the spider.
const (
    // Log-line prefixes: PREFIX is the common tag, the PREFIX_* variants
    // append a severity marker.
    PREFIX         = "hjx—>"
    PREFIX_INFO    = PREFIX + "[INFO]:"
    PREFIX_ERROR   = PREFIX + "[ERROR]:"
    PREFIX_WARNING = PREFIX + "[WARNING]:"
    // TARGET_URL is the site root that listing and video page paths are
    // appended to.
    TARGET_URL     = "https://www.hjxstbserver.xyz/"
    // DISK, FOLDER and MERGER_FOLDER are the defaults of the -d, -f and -m
    // flags; main combines them as <disk>:/<folder>/<mergerFolder>.
    DISK           = "e"
    FOLDER         = "goqbl"
    MERGER_FOLDER  = "MV"
    // DOWNLOAD_PAGE is the default of -p (number of listing pages to crawl).
    DOWNLOAD_PAGE  = 5
    // SPIDER_USER is the default of -n (number of concurrent download workers).
    SPIDER_USER    = 6
    // LOG_FILE and DOWNED_FILE are file names placed inside the working
    // folder: the run log and the record of titles already merged.
    LOG_FILE       = "spider.log"
    DOWNED_FILE    = "downed.log"
    // IS_SUE switches the crawled listing from /videos/japanese to
    // /videos/amateur.
    IS_SUE         = false
)

// lock serializes hlog calls so that log lines written by different
// goroutines do not interleave.
var lock sync.Mutex
// wirtelock is held while a worker appends a finished title to the
// downed-record file.
var wirtelock sync.Mutex
// wg lets main wait for the producer goroutine and the download workers.
var wg sync.WaitGroup
// targetChan carries one {title: index URL} map per video from
// getTargetPageURL to the getDownTs workers. It is unbuffered and is closed
// by getTargetPageURL once all pages have been processed.
var targetChan = make(chan map[string]string)

// Command-line flags; their defaults are the DISK, FOLDER, MERGER_FOLDER,
// DOWNLOAD_PAGE and SPIDER_USER constants.
var disk = flag.String("d", DISK, "Disk name of working path")
var folder = flag.String("f", FOLDER, "Folder name of working path")
var mergerFolder = flag.String("m", MERGER_FOLDER, "Folder name of the merger ts")
var page = flag.Int("p", DOWNLOAD_PAGE, "Download qbl page nums")
var userNum = flag.Int("n", SPIDER_USER, "Concurrent number")

// main parses the command-line flags, prepares the working folders, then runs
// one producer goroutine (getTargetPageURL) and *userNum download workers
// (getDownTs) and waits for all of them to finish.
func main() {
    flag.Parse()
    runtime.GOMAXPROCS(runtime.NumCPU())

    // All paths hang off <disk>:/<folder>.
    workDir := *disk + ":/" + *folder
    mergeDir := workDir + "/" + *mergerFolder
    logPath := workDir + "/" + LOG_FILE
    downedPath := workDir + "/" + DOWNED_FILE

    // Creating the merge folder also creates the working folder above it.
    if !createDir(mergeDir, logPath) {
        hlog(logPath, PREFIX_ERROR, "An unexpected error occurred while creating the folder", mergeDir, "the program is about to exit")
        os.Exit(1)
    }
    hlog(logPath, PREFIX_INFO, "Buddha bless hjxSpider program no bugs,Start now")

    // Producer: feeds targetChan with the videos found on the listing pages.
    wg.Add(1)
    go func() {
        defer wg.Done()
        getTargetPageURL(*page, logPath, downedPath)
    }()

    // Consumers: each worker drains targetChan until it is closed.
    for worker := 0; worker < *userNum; worker++ {
        wg.Add(1)
        go func(id int) {
            defer wg.Done()
            getDownTs(id, logPath, downedPath, workDir, mergeDir)
        }(worker)
    }

    wg.Wait()
    hlog(logPath, PREFIX_INFO, "Thanks for Buddha bless,End now")
}

// getDownTs is the download worker. It receives {title: index URL} maps from
// targetChan until the channel is closed and processes every entry with
// downOneTarget.
//
//   threadNum  zero-based worker index, used only for log output
//   fileName   path of the log file passed to hlog
//   downedFile path of the record file that lists finished titles
//   downPath   working folder; each title gets a sub-folder below it
//   mergerPath folder that receives the merged .mp4 files
//
// The function deliberately does NOT call wg.Done(): the goroutine wrapper in
// main already does so, and a second call per worker drives the WaitGroup
// counter negative, which panics.
func getDownTs(threadNum int, fileName, downedFile, downPath, mergerPath string) {
    threadWT := "ThreadNum:" + strconv.Itoa(threadNum+1)
    for indexMap := range targetChan {
        for title, index := range indexMap {
            downOneTarget(threadWT, title, index, fileName, downedFile, downPath, mergerPath)
        }
    }
}

// downOneTarget downloads every segment of one playlist into
// downPath/<title>, merges them with ffmpeg into mergerPath/<title>.mp4 and,
// on success, records the title in downedFile.
func downOneTarget(threadWT, title, index, fileName, downedFile, downPath, mergerPath string) {
    // Maximum number of download attempts per segment.
    const maxTsAttempts = 5

    downLoadTsFolder := downPath + "/" + title
    hlog(fileName, PREFIX_INFO, threadWT, "The file being downloaded is ", title, "Folder:", downLoadTsFolder)
    if !createDir(downLoadTsFolder, fileName) {
        return
    }
    indexText := getURL(index)
    if indexText == "" {
        return
    }
    localIndex := downLoadTsFolder + "/index.m3u8"
    downTslist, err := writeLocalIndex(localIndex, indexText)
    if err != nil {
        hlog(fileName, PREFIX_ERROR, threadWT, title, "An unexpected error occurred while creating index.m3u8, the program is about to end")
        return
    }
    hlog(fileName, PREFIX_INFO, threadWT, title, "ts file has been obtained")

    lostTsnum := 0
    for _, url := range downTslist {
        tsName := url[strings.LastIndex(url, "/")+1:]
        var downErr error
        for attempt := 1; attempt <= maxTsAttempts; attempt++ {
            hlog(fileName, PREFIX_INFO, threadWT, title, "The ts being downloaded is:", tsName, " Current downloads:", strconv.Itoa(attempt))
            if downErr = downTsFile(url, downLoadTsFolder+"/"+tsName); downErr == nil {
                break
            }
        }
        if downErr != nil {
            hlog(fileName, PREFIX_ERROR, threadWT, "title:", title, "ts:", tsName, "Download failed, lost current ts")
            lostTsnum++
            continue
        }
        // Pause between segments to avoid hammering the server.
        time.Sleep(2 * time.Second)
    }
    // Too many holes in the stream: do not produce a merged file.
    if lostTsnum > 5 {
        return
    }

    outFile := mergerPath + "/" + strings.Replace(title, " ", "", -1) + ".mp4"
    // NOTE(review): "-absf" is the legacy spelling of "-bsf:a"; newer ffmpeg
    // builds may reject it. Confirm against the installed ffmpeg version.
    mergerTs := exec.Command("cmd", "/C", "ffmpeg", "-i", localIndex, "-vcodec", "copy", "-acodec", "copy", "-absf", "aac_adtstoasc", outFile)
    if err := mergerTs.Run(); err != nil {
        fmt.Println("Error: ", err)
        hlog(fileName, PREFIX_ERROR, title, "mergred", outFile+" file failure")
        return
    }
    hlog(fileName, PREFIX_INFO, title, "aready mergred", outFile+" success")
    recordDowned(fileName, downedFile, title)
}

// writeLocalIndex rewrites the remote playlist text into a local playlist at
// path, replacing every segment URL by its bare file name, and returns the
// original segment URLs in playlist order. The file is truncated first so a
// previous, longer playlist cannot leave stale lines behind, and it is closed
// before returning so ffmpeg reads a complete file.
func writeLocalIndex(path, indexText string) ([]string, error) {
    f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
    if err != nil {
        return nil, err
    }
    var local strings.Builder
    var segments []string
    for _, line := range strings.Split(indexText, "\n") {
        // Playlists served with CRLF line ends would otherwise leave a "\r"
        // on every tag and URL.
        line = strings.TrimSpace(line)
        if line == "" {
            continue
        }
        if strings.HasPrefix(line, "#") {
            // NOTE(review): this EXTINF value and the aaa0.ts segment below
            // are dropped together; presumably an inserted clip. Confirm.
            if line != "#EXTINF:10.041667," {
                local.WriteString(line + "\n")
            }
            continue
        }
        tsName := line[strings.LastIndex(line, "/")+1:]
        if tsName == "aaa0.ts" {
            continue
        }
        local.WriteString(tsName + "\n")
        segments = append(segments, line)
    }
    if _, err := f.WriteString(local.String()); err != nil {
        f.Close()
        return nil, err
    }
    if err := f.Close(); err != nil {
        return nil, err
    }
    return segments, nil
}

// downTsFile downloads url into the file dest. The response body and the
// destination file are closed on every path; a partially written file is
// removed when the copy fails.
func downTsFile(url, dest string) error {
    res, err := http.Get(url)
    if err != nil {
        return err
    }
    defer res.Body.Close()
    if res.StatusCode < 200 || res.StatusCode > 299 {
        return fmt.Errorf("get %s: unexpected status %s", url, res.Status)
    }
    f, err := os.Create(dest)
    if err != nil {
        return err
    }
    if _, err := io.Copy(f, res.Body); err != nil {
        f.Close()
        os.Remove(dest)
        return err
    }
    return f.Close()
}

// recordDowned appends title as one line to downedFile while holding
// wirtelock. O_WRONLY is requested explicitly: without an access mode the
// flags default to O_RDONLY, which makes writes fail on POSIX systems.
func recordDowned(fileName, downedFile, title string) {
    wirtelock.Lock()
    defer wirtelock.Unlock()
    f, err := os.OpenFile(downedFile, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
    if err != nil {
        hlog(fileName, PREFIX_ERROR, "create", downedFile, "failure")
        return
    }
    defer f.Close()
    if _, err := f.WriteString(title + "\n"); err != nil {
        hlog(fileName, PREFIX_ERROR, "write", downedFile, "failure")
    }
}

//get target URL
func getTargetPageURL(PAGE int, fileName, downedFile string) {
    suffix := "/videos/japanese?page="
    if IS_SUE {
        suffix = "/videos/amateur?page="
    }
    for i := 0; i < PAGE; i++ {
        url := TARGET_URL + suffix + strconv.Itoa(i+1)
        targetText := getURL(url)
        if targetText == "" {
            hlog(fileName, PREFIX_ERROR, "get", url, "An unexpected error,the program is about to exit")
        } else {
            regURL := regexp.MustCompile(`href="/video/\d*/"`)
            regTargetListURL := regURL.FindAllString(targetText, -1)
            regTitle := regexp.MustCompile(`title="[^"]*"`)
            regTargetListTitle := regTitle.FindAllString(targetText, -1)
            regTargetListTitle = regTargetListTitle[2 : len(regTargetListTitle)-2]
            //var titleURL map[string]string
            for i, indexURL := range regTargetListURL {
                mapValueURL := TARGET_URL + strings.TrimSpace(strings.Split(indexURL, "\"")[1])
                mapKey := strings.TrimSpace(strings.Split(regTargetListTitle[i], "\"")[1])
                isDown := cheakDown(downedFile, mapKey)
                if isDown {
                    hlog(fileName, PREFIX_WARNING, mapKey, "Already downloaded, no need to download again")
                    continue
                }
                indexTarget := getURL(mapValueURL)
                regIndex := regexp.MustCompile(`<source src="[^"]*"`)
                regIndexURL := regIndex.FindString(indexTarget)
                regIndexURL = "http:" + strings.TrimSpace(strings.Split(regIndexURL, "\"")[1])
                targetChan <- map[string]string{
                    mapKey: regIndexURL,
                }
            }
        }
    }
    close(targetChan)
}

// cheakDown reports whether title occurs anywhere in the record file
// cheakDownFile. The file is created empty when it does not exist yet. Any
// error while opening or reading the file is printed and treated as "not
// downloaded".
func cheakDown(cheakDownFile, title string) bool {
    f, err := os.OpenFile(cheakDownFile, os.O_CREATE|os.O_RDONLY, 0644)
    if err != nil {
        fmt.Println(err)
        return false
    }
    defer f.Close()

    b, err := ioutil.ReadAll(f)
    if err != nil {
        fmt.Println(err)
        return false
    }
    // Substring match: the record is a plain concatenation of titles.
    return strings.Contains(string(b), title)
}

// getURL fetches target with an HTTP GET and returns the response body as a
// string. It returns "" when the request fails, when the server answers with
// a non-2xx status, or when the body cannot be read, so callers never parse
// an error page as if it were content.
func getURL(target string) string {
    // A bounded request: without a timeout a stalled server would block the
    // caller forever. The zero-value Transport means the shared default
    // transport is used, so connections are still reused across calls.
    client := http.Client{Timeout: 30 * time.Second}
    resp, err := client.Get(target)
    if err != nil {
        return ""
    }
    defer resp.Body.Close()

    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        return ""
    }
    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        return ""
    }
    return string(body)
}

func createDir(folder, fileName string) bool {
    if _, err := os.Stat(folder); err != nil {
        if os.IsNotExist(err) {
            os.MkdirAll(folder, os.ModePerm)
            hlog(fileName, PREFIX_INFO, "Created", folder, "successfully")
        } else {
            return false
        }
    } else {
        hlog(fileName, PREFIX_INFO, "Working path", folder, "already exists")
    }
    return true
}

// hlog writes one log line, built as prefix + timestamp followed by the
// space-separated logStrList entries, to standard output and appends it to
// the file fileName (created on first use). Calls are serialized through the
// package-level lock.
//
// A log file that cannot be opened or written no longer aborts the program:
// the line is still printed and the file problem is reported on stderr.
func hlog(fileName, prefix string, logStrList ...string) {
    lock.Lock()
    defer lock.Unlock()

    timeStr := time.Now().Format("2006-01-02 15:04:05")
    logPrint := strings.Join(append([]string{prefix + timeStr}, logStrList...), " ")
    fmt.Println(logPrint)

    logFile, err := os.OpenFile(fileName, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0666)
    if err != nil {
        fmt.Fprintln(os.Stderr, "hlog: cannot open log file:", err)
        return
    }
    defer logFile.Close()
    if _, err := logFile.WriteString(logPrint + "\n"); err != nil {
        fmt.Fprintln(os.Stderr, "hlog: cannot write log file:", err)
    }
}



有疑问加站长微信联系(非本文作者)

本文来自:简书

感谢作者:石鸟路遇

查看原文:golang版本对某网站HLS爬取与聚合

入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889

769 次点击  
加入收藏 微博
暂无回复
添加一条新回复 (您需要 登录 后才能回复 没有账号 ?)
  • 请尽量让自己的回复能够对别人有帮助
  • 支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
  • 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
  • 图片支持拖拽、截图粘贴等方式上传