package main
import (
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strings"
"sync"
"golang.org/x/net/html"
)
// ground is a package-level WaitGroup; nothing in this file calls it yet.
var ground sync.WaitGroup

// str is the address of the index page that main passes to
// GetURLInfomationAdress as the starting point of the crawl.
var str = "https://docs.hacknode.org/gopl-zh/"
// init installs a deferred recover handler that terminates the program via
// log.Fatal if a panic is recovered.
//
// The handler runs when init itself returns, so it only observes panics
// raised inside init; it does not guard main or any goroutine started later.
func init() {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		log.Fatal("recover error is :", recovered)
	}()
}
// CreatFile appends bt to the file "url<i>.txt" inside the hard-coded
// directory F:/MyGo/src/waitground_user, creating the file when it does not
// exist yet. The directory itself is not created and must already exist.
//
// bt is the content to write; i is the text placed between "url" and ".txt"
// in the file name. Any failure to open, write or close the file terminates
// the program through log.Fatal.
func CreatFile(bt []byte, i string) {
	name := "F:/MyGo/src/waitground_user/url" + i + ".txt"

	// O_WRONLY has to be requested explicitly: with only O_CREATE|O_APPEND the
	// access mode defaults to O_RDONLY and the Write below is rejected on
	// POSIX systems.
	f, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatal(err)
	}

	if _, err := f.Write(bt); err != nil {
		// The write error is the one worth reporting; the Close result is
		// deliberately ignored because the process exits right after.
		f.Close()
		log.Fatal(err)
	}

	// Close can surface a delayed write error, so it is checked as well.
	if err := f.Close(); err != nil {
		log.Fatal(err)
	}
}
// GetURLInfomation downloads URL with an HTTP GET and returns the complete
// response body.
//
// After the body has been read successfully the value 1 is sent on ch, so the
// call blocks until that send can proceed. A request error, a status other
// than 200 OK, or a read error terminates the program through log.Fatal.
func GetURLInfomation(URL string, ch chan int) (bt []byte) {
	response, getErr := http.Get(URL)
	if getErr != nil {
		log.Fatal(getErr)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		log.Fatal("Can't connect")
	}

	body, readErr := ioutil.ReadAll(response.Body)
	if readErr != nil {
		log.Fatal(readErr)
	}

	// Signal completion to whoever is listening on ch.
	ch <- 1
	return body
}
// GetURLInfomationAdress downloads the page at URL, parses it as HTML and
// returns the targets of the href attributes found on its <a> elements.
//
// Every href is resolved relative to the URL of the request that produced the
// response, so relative links come back as absolute URLs. Each distinct URL
// appears once, in the order it is first encountered. An href that cannot be
// parsed is logged and skipped so that one bad link does not abort the crawl.
//
// A request error, a status other than 200 OK, or an HTML parse error
// terminates the program through log.Fatal.
func GetURLInfomationAdress(URL string) []string {
	resp, err := http.Get(URL)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		log.Fatal("Can't connect:", URL)
	}

	// Parse the document into a node tree.
	doc, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	var links []string
	// seen records the URLs already collected, replacing a linear scan of
	// links for every candidate.
	seen := make(map[string]bool)

	// collect handles a single node: it gathers the href values of <a> elements.
	collect := func(n *html.Node) {
		if n.Type != html.ElementNode || n.Data != "a" {
			return
		}
		for _, a := range n.Attr {
			if a.Key != "href" {
				continue
			}
			link, err := resp.Request.URL.Parse(a.Val)
			if err != nil {
				log.Printf("skipping bad href %q: %v", a.Val, err)
				continue
			}
			target := link.String()
			if seen[target] {
				continue
			}
			seen[target] = true
			// Fetching the content of each link could be started in parallel here.
			links = append(links, target)
		}
	}

	ForEachNode(doc, collect, nil)
	return links
}
// ForEachNode walks the tree rooted at n depth-first.
//
// For every node, pre (when non-nil) is called before the node's children are
// visited and post (when non-nil) is called after all of them have been
// visited. Children are visited in sibling order.
func ForEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}

	child := n.FirstChild
	for child != nil {
		ForEachNode(child, pre, post)
		child = child.NextSibling
	}

	if post != nil {
		post(n)
	}
}
// checkRegexp compiles reg and applies it to cont; style selects the result:
//
//	0       the first match, as a string
//	1       all matches, as a []string
//	other   all matches, as a [][]byte
//
// reg is compiled with regexp.MustCompile, so an invalid pattern panics.
func checkRegexp(cont string, reg string, style int) (result interface{}) {
	pattern := regexp.MustCompile(reg)
	if style == 0 {
		return pattern.FindString(cont)
	}
	if style == 1 {
		return pattern.FindAllString(cont, -1)
	}
	return pattern.FindAll([]byte(cont), -1)
}
// CheckURL reports whether link is absent from links: it returns true when
// link is not yet in the slice and false when an equal entry already exists.
func CheckURL(link string, links []string) bool {
	for i := range links {
		if links[i] == link {
			return false
		}
	}
	return true
}
// main crawls the index page at str and stores every page it links to.
//
// It first collects the link targets of the index page, then starts one
// goroutine per link. Each goroutine derives a file name from the last path
// segment of the link (the text before its first "."), downloads the page
// with GetURLInfomation and hands the content to CreatFile. main returns once
// every goroutine has finished.
func main() {
	// First pass: collect the link targets found on the index page.
	links := GetURLInfomationAdress(str)
	fmt.Println("first finish")

	// GetURLInfomation sends one value on ch per download. One buffer slot per
	// link guarantees those sends never block, so nobody has to drain ch.
	ch := make(chan int, len(links))

	// wg tracks the workers. Ranging over ch cannot be used for that, because
	// ch is never closed and such a loop would never end.
	var wg sync.WaitGroup
	for _, t := range links {
		t := t // per-iteration copy for the closure (required before Go 1.22)
		wg.Add(1)
		go func() {
			defer wg.Done()

			// Use the page name from the address as the file name.
			fname := strings.Split(t, "/")
			ff := fname[len(fname)-1]
			fmt.Println("地址:", t)
			fmt.Println(ff)
			ft := strings.Split(ff, ".")
			fft := ft[0]
			fmt.Println(fft)

			p := GetURLInfomation(t, ch)
			CreatFile(p, fft)
		}()
	}
	wg.Wait()
}
// If you have questions, contact the site administrator on WeChat (not the author of this article).