package main
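/*
 * Handles Chinese character-encoding issues: link texts read from the crawled
 * pages are decoded from GB18030 before being used as local names.
 */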
import (
"errors"
"flag"
"fmt"
query "github.com/PuerkitoBio/goquery"
"golang.org/x/text/encoding/simplifiedchinese"
"io/ioutil"
"net/http"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
)
// Package-level runtime configuration and crawl bookkeeping.
var (
	// np is the number of logical CPUs reported by the runtime at start-up.
	np = runtime.NumCPU()

	// Evaluated only for its side effect: sets GOMAXPROCS to np during
	// package initialization. The previous setting it returns is discarded.
	_ = runtime.GOMAXPROCS(np)

	// wg counts the folder-crawling goroutines so main can wait for them.
	wg sync.WaitGroup
)
// Folder describes a remote directory that still has to be crawled.
type Folder struct {
	// Url is the remote address of the directory page; Dir is the local
	// directory its contents are mirrored into.
	Url, Dir string
}
// File describes a single remote file to download.
type File struct {
	// Url is the remote address of the file, Dir the local directory it is
	// saved in, and Name the file name used inside Dir.
	Url, Dir, Name string
}
// checkErr aborts the program when err is non-nil: it prints the error
// message to standard output and exits with status 1. A nil err is a no-op.
func checkErr(err error) {
	if err == nil {
		return
	}
	fmt.Printf("%v\n", err.Error())
	os.Exit(1)
}
// decodeToGBK converts text from GB18030 into a UTF-8 string. Despite its
// name, it decodes *from* the Chinese encoding rather than encoding to it.
//
// On failure the original text is returned unchanged together with the error.
func decodeToGBK(text string) (string, error) {
	// Decoder.String sizes the output buffer itself. A fixed buffer of
	// len(text)*2 bytes is too small whenever a single input byte expands to
	// a three-byte UTF-8 sequence (for example a byte that cannot be
	// transcoded and is replaced by U+FFFD), which made Transform fail with a
	// short-destination error.
	decoded, err := simplifiedchinese.GB18030.NewDecoder().String(text)
	if err != nil {
		return text, err
	}
	return decoded, nil
}
// printEach writes the text content of item to standard output, prefixed
// with "Selection: ". The index argument is not used.
func printEach(index int, item *query.Selection) {
	text := item.Text()
	fmt.Println("Selection: ", text)
}
// isDir reports whether path denotes a directory, which in this crawler
// means that it ends with a forward slash. The empty string is not a
// directory.
func isDir(path string) bool {
	const dirMarker = "/"
	return strings.HasSuffix(path, dirMarker)
}
// makeFolder builds the Folder for a directory link.
//
// item is the anchor element, url the address of the page that contains it
// (crawl passes it with a trailing "/") and dir the local directory that
// mirrors that page. The href attribute is appended to url, and the link
// text, decoded by decodeToGBK, becomes the sub-directory name under dir.
//
// A nil Folder and a non-nil error are returned when the anchor has no href
// attribute or when its text cannot be decoded.
func makeFolder(item *query.Selection, url, dir string) (f *Folder, err error) {
	tx := item.Text()

	// Check the href before decoding, the same order makeFile uses, so both
	// constructors report the same error for the same kind of broken link and
	// no decoding work is done for links that are discarded anyway.
	href, ok := item.Attr("href")
	if !ok {
		return nil, errors.New("makeFolder : " + tx + " href属性不存在")
	}

	name, err := decodeToGBK(tx)
	if err != nil {
		return nil, err
	}

	return &Folder{Url: url + href, Dir: filepath.Join(dir, name)}, nil
}
// makeFile builds the File for a non-directory link.
//
// item is the anchor element, url the address of the page that contains it
// and dir the local directory the file is saved in. The href attribute is
// appended to url, and the link text, decoded by decodeToGBK, becomes the
// local file name.
//
// A nil File and a non-nil error are returned when the anchor has no href
// attribute or when its text cannot be decoded.
func makeFile(item *query.Selection, url, dir string) (f *File, err error) {
	label := item.Text()

	target, found := item.Attr("href")
	if !found {
		return nil, errors.New("makeFile : " + label + " href属性不存在")
	}

	decoded, decodeErr := decodeToGBK(label)
	if decodeErr != nil {
		return nil, decodeErr
	}

	return &File{Url: url + target, Dir: dir, Name: decoded}, nil
}
// crawl fetches the page at url and walks every anchor element on it.
//
// Links whose text ends in "/" are treated as folders: each one is crawled
// in its own goroutine, registered on wg. All other links are treated as
// files and downloaded synchronously into localDir. Errors are printed to
// standard output; a failed page fetch abandons this page, a broken link is
// skipped.
func crawl(url, localDir string) {
	doc, err := query.NewDocument(url)
	if err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}

	// Link hrefs are appended directly to url, so it has to end in a slash.
	if !strings.HasSuffix(url, "/") {
		url += "/"
	}

	doc.Find("a").Each(func(_ int, link *query.Selection) {
		if !isDir(link.Text()) {
			file, err := makeFile(link, url, localDir)
			if err != nil {
				fmt.Printf("%v\n", err.Error())
				return
			}
			download(file)
			return
		}

		folder, err := makeFolder(link, url, localDir)
		if err != nil {
			fmt.Printf("%v\n", err.Error())
			return
		}
		// Register the goroutine before starting it so that wg.Wait cannot
		// observe a zero counter while work is still being scheduled.
		wg.Add(1)
		go crawlFolder(folder)
	})
}
// download fetches file.Url and stores the response body as
// filepath.Join(file.Dir, file.Name), creating file.Dir first if necessary.
//
// Failures are reported on standard output and abandon this download only;
// nothing is returned to the caller.
func download(file *File) {
	dir := file.Dir

	// MkdirAll returns nil when dir already exists as a directory, so any
	// error here means the target directory is unusable and the write below
	// could not succeed.
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}
	// Best effort: the mode passed to MkdirAll is applied before the umask,
	// so re-apply the full permission bits explicitly. A failure here does
	// not prevent the download.
	if err := os.Chmod(dir, os.ModePerm); err != nil {
		fmt.Printf("%v\n", err.Error())
	}

	resp, err := http.Get(file.Url)
	if err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}
	defer resp.Body.Close()

	// Do not save an error page (404, 500, ...) as if it were the file.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		fmt.Printf("%v: unexpected status %v\n", file.Url, resp.Status)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}

	// The []rune round trip replaces any invalid UTF-8 bytes in the path
	// with U+FFFD before it is handed to the file system.
	fp := string([]rune(filepath.Join(dir, file.Name)))
	if err := ioutil.WriteFile(fp, body, 0777); err != nil {
		fmt.Printf("%v fp:[%v]\n", err.Error(), fp)
		return
	}
	fmt.Printf("Download: %+v\n", file)
}
// crawlFolder crawls one remote folder into its local directory and then
// marks one unit of work on wg as finished. The caller must have called
// wg.Add(1) before starting it.
func crawlFolder(folder *Folder) {
	// Deferred so the WaitGroup counter is released on every exit path from
	// crawl, not only on a normal return.
	defer wg.Done()
	crawl(folder.Url, folder.Dir)
}
// main reads the -host and -locate flags, crawls the remote tree rooted at
// the host into the local directory, and waits for every folder goroutine to
// finish before exiting.
func main() {
	var host, location string
	flag.StringVar(&host, "host", "http://localhost:8000", "HTTP服务地址Host")
	flag.StringVar(&location, "locate", "E:/Crawler下载文件", "本地文件系统绝对路径")
	flag.Parse()

	crawl(host, location)
	wg.Wait()
}
// If you have questions, contact the site administrator on WeChat (not the author of this article).