// Currently this only crawls every sub-URL found under a given URL.
// TODO: process the HTML content itself.
package main
import (
"fmt"
"io/ioutil"
// "html/template"
"bytes"
"net/http"
"strings"
)
// Crawler state shared by the functions in this file.
var (
	// subSlice is the byte pattern findHref searches for; main assigns it
	// before crawling starts.
	subSlice []byte

	// arrIndex counts the URLs recorded so far; recursiveAnalysisUrl
	// increments it each time it appends a URL.
	arrIndex int
)
//func analysisHtmlSlice(slice []byte, []string) {
//}
// exist reports whether url already appears in arrurl.
//
// Entries are compared with strings.EqualFold, so two URLs that differ only
// in letter case are treated as the same URL. A nil or empty arrurl yields
// false.
func exist(url string, arrurl []string) bool {
	for _, known := range arrurl {
		if strings.EqualFold(known, url) {
			return true
		}
	}
	return false
}
// isSubOf reports whether suburl lies underneath url, i.e. whether suburl
// starts with url.
//
// A prefix test is used rather than a substring test: a link on another host
// that merely embeds url somewhere (for example a share link such as
// "http://other.example/?u=<url>") is not a sub-URL and must not be followed
// by the crawler. The comparison is case-sensitive.
func isSubOf(suburl string, url string) bool {
	return strings.HasPrefix(suburl, url)
}
// findHref locates the first occurrence of the package-level subSlice pattern
// in slice and extracts the attribute value that follows it.
//
// The value is taken to start 6 bytes after the match (the length of
// `href="`) and to run up to, but not including, the next double quote.
//
// It returns the offset just past that closing quote, relative to the start
// of slice, together with the value bytes (a sub-slice of slice, not a copy).
// When the pattern or the closing quote cannot be found, offset is -1 and
// urlSlice is nil.
func findHref(slice []byte) (offset int, urlSlice []byte) {
	// Number of bytes to skip from the match position to reach the value.
	const attrLen = 6

	start := bytes.Index(slice, subSlice)
	if start < 0 {
		return -1, nil
	}
	start += attrLen

	length := bytes.IndexByte(slice[start:], '"')
	if length < 0 {
		return -1, nil
	}

	end := start + length
	return end + 1, slice[start:end]
}
// rootURL is the site being crawled. Only links underneath it are followed,
// and the root itself is never recorded as a discovered link.
const rootURL = "http://www.dreamingwish.com/"

// fetchBody downloads url and returns the raw response body.
//
// The response body is closed before fetchBody returns, so no connection is
// held open while the caller recurses into the links it finds. A response
// whose status is not 200 OK is reported as an error.
func fetchBody(url string) ([]byte, error) {
	r, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer r.Body.Close()

	if r.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching %s: unexpected status %s", url, r.Status)
	}
	return ioutil.ReadAll(r.Body)
}

// recursiveAnalysisUrl fetches url, records every not-yet-seen link under
// rootURL that appears on the page, and then crawls each of those new links
// in turn.
//
// urlArr is the list of URLs recorded so far. Because append may reallocate
// and never changes the caller's slice length, the updated list is returned;
// callers that want to see newly discovered URLs must use the return value.
// Threading one list through the whole recursion means each URL is recorded
// and crawled once.
//
// The package-level arrIndex is incremented for every URL that is recorded.
// Fetch errors are printed and leave urlArr unchanged.
func recursiveAnalysisUrl(url string, urlArr []string) []string {
	begin := len(urlArr)
	fmt.Printf("analysis url:%s, 当前位置:%d\n", url, begin)

	body, err := fetchBody(url)
	if err != nil {
		fmt.Printf("%v\n", err)
		return urlArr
	}

	// Collect all new links on this page before descending into any of them.
	offset := 0
	for {
		suboffset, urlstr := findHref(body[offset:])
		if suboffset == -1 {
			break
		}
		offset += suboffset

		link := string(urlstr)
		if exist(link, urlArr) || !isSubOf(link, rootURL) || strings.EqualFold(link, rootURL) {
			continue
		}
		urlArr = append(urlArr, link)
		arrIndex++
		fmt.Printf("%d:%s\n", arrIndex-1, link)
	}

	// Fix the upper bound now: the recursive calls below extend urlArr with
	// links they have already crawled themselves.
	end := len(urlArr)
	fmt.Printf("begin:%d, end:%d", begin, end)
	for i := begin; i < end; i++ {
		fmt.Printf("这是第%d个url 到第%d个url-------", begin, end)
		urlArr = recursiveAnalysisUrl(urlArr[i], urlArr)
	}
	return urlArr
}

// main crawls rootURL and prints every URL discovered underneath it.
func main() {
	// Pattern findHref looks for: the start of an absolute http(s) link.
	subSlice = []byte(`href="http`)

	arrUrl := make([]string, 0, 50)
	fmt.Printf("当前位置%d\n", len(arrUrl))

	arrUrl = recursiveAnalysisUrl(rootURL, arrUrl)
	for _, u := range arrUrl {
		fmt.Println(u)
	}
}
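// Scraped page footer, not part of the program: "有疑问加站长微信联系(非本文作者)"
// ("For questions, contact the site administrator on WeChat (not the author of this article).")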