package main
import(
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strings"
"os"
"image/png"
"image/jpeg"
"image"
)
// filePath is the prefix prepended to the name of every saved image. It is
// concatenated as-is with the file name, so it has to end with a path
// separator.
var filePath = "F:/girls/"
// fileExist reports whether os.Stat succeeds for fileName. Every Stat
// failure, not only "does not exist", is reported as false.
func fileExist(fileName string) bool {
	_, err := os.Stat(fileName)
	return err == nil
}
// Substr returns the run of runes (not bytes) of str that starts at start and
// spans length runes.
//
// A negative start is resolved relative to the last rune: it is replaced by
// len-1+start, so -2 with length 3 yields the final three runes. A negative
// length selects the runes before start instead. Both ends of the resulting
// range are clamped to [0, len], so the function never panics and returns ""
// when the range falls outside the string.
func Substr(str string, start, length int) string {
	runes := []rune(str)
	n := len(runes)

	if start < 0 {
		start += n - 1
	}

	lo, hi := start, start+length
	if lo > hi {
		lo, hi = hi, lo
	}

	// clamp forces an index into the valid slicing range of runes.
	clamp := func(i int) int {
		switch {
		case i < 0:
			return 0
		case i > n:
			return n
		}
		return i
	}

	return string(runes[clamp(lo):clamp(hi)])
}
// getImageList fetches the listing page at url, extracts every album page
// link from it and passes the links to getImageLink.
//
// c is the completion channel the caller counts on. When the listing page
// cannot be fetched this function sends the completion value itself;
// otherwise signalling is delegated to getImageLink. Returning without a
// send would leave a caller that counts completions blocked forever.
func getImageList(url string, c chan int) {
	fmt.Println("get page link url==>", url)
	body := getUrl(url)
	if body == "" {
		// Nothing to crawl, but the page is still "done" from the caller's
		// point of view.
		c <- 1
		return
	}
	// Dots are escaped so that they only match a literal '.' instead of any
	// character.
	reg := regexp.MustCompile(`http://www\.meizitu\.com/a/[0-9]+\.html`)
	links := reg.FindAllString(body, -1)
	getImageLink(links, c)
}
// getImageLink visits every album page in links, extracts the image URLs
// found on it and hands them to downloadImage.
//
// A page that cannot be fetched is skipped; the remaining pages are still
// processed. One value is always sent on c after the last link, so the
// caller waiting on c is released even when some (or all) pages failed.
func getImageLink(links []string, c chan int) {
	// Compiled once instead of once per page; dots in the host name are
	// escaped so they only match a literal '.'.
	reg := regexp.MustCompile(`http://pic\.meizitu\.com/wp-content/uploads/[^\.]+\.(jpg|png|gif)`)
	for _, uri := range links {
		fmt.Println("Get images url, page link==>", uri)
		body := getUrl(uri)
		if body == "" {
			// Skip only this page; returning here would drop the other
			// links and never signal completion.
			continue
		}
		downloadImage(reg.FindAllString(body, -1))
	}
	c <- 1
}
// downloadImage downloads every URL in images and stores each picture under
// filePath. A failure on one image never prevents the following images from
// being processed.
func downloadImage(images []string) {
	for _, v := range images {
		fmt.Println("Download image, url==>", v)
		downloadOne(v)
	}
}

// downloadOne fetches the image at v, rejects jpg/png pictures smaller than
// 200x200 and writes the data to disk unless the target file already exists.
//
// Handling one image per call makes the deferred Body.Close run after each
// download instead of piling up until the whole batch is finished.
func downloadOne(v string) {
	resp, err := http.Get(v)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// An error page must not be stored as if it were an image.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("Unexpected HTTP status, skip image, url ==>", v, resp.Status)
		return
	}

	content, err := ioutil.ReadAll(resp.Body)
	body := string(content)
	if err != nil || body == "" {
		fmt.Println("content is null")
		return
	}

	// Only jpg and png are decoded for the size check; any other extension
	// (gif) is saved without inspecting its dimensions.
	var iImage image.Image
	switch Substr(v, -2, 3) {
	case "jpg":
		iImage, err = jpeg.Decode(strings.NewReader(body))
	case "png":
		iImage, err = png.Decode(strings.NewReader(body))
	}
	if err != nil {
		// Data that does not decode as the advertised format is dropped.
		return
	}
	if iImage != nil {
		// Only download large images, skip small ones.
		if rect := iImage.Bounds(); rect.Max.X < 200 || rect.Max.Y < 200 {
			fmt.Println("Skip download image, url ==>", v)
			return
		}
	}

	// The file name is built from the last four path segments of the URL.
	paths := strings.Split(v, "/")
	n := len(paths)
	if n < 4 {
		fmt.Println("Skip download image, url ==>", v)
		return
	}
	fileName := filePath + paths[n-4] + paths[n-3] + paths[n-2] + paths[n-1]
	if fileExist(fileName) {
		return
	}

	f, err := os.Create(fileName)
	if err != nil {
		fmt.Println("open file error")
		return
	}
	_, writeErr := f.WriteString(body)
	closeErr := f.Close()
	if writeErr != nil || closeErr != nil {
		fmt.Println("write file error")
		// Best effort: a partial file would be treated as "already
		// downloaded" on the next run, so try to remove it. Nothing more
		// can be done if the removal fails as well.
		_ = os.Remove(fileName)
	}
}
// getUrl performs an HTTP GET on url and returns the response body as a
// string. Any failure, either while issuing the request or while reading the
// body, yields the empty string. The HTTP status code is not inspected.
func getUrl(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	if data, err := ioutil.ReadAll(resp.Body); err == nil {
		return string(data)
	}
	return ""
}
// main crawls listing pages 1 through 10 concurrently, one goroutine per
// page, and waits until each of them has reported completion.
func main() {
	const (
		pageFormat = "http://www.meizitu.com/a/sifang_5_%d.html"
		firstPage  = 1
		pageCount  = 10
	)

	// Buffered with one slot per page so a finishing goroutine never blocks
	// on its completion signal.
	done := make(chan int, pageCount)

	for page := firstPage; page < firstPage+pageCount; page++ {
		go func(page int) {
			url := fmt.Sprintf(pageFormat, page)
			fmt.Println("Parse url:", url)
			getImageList(url, done)
		}(page)
	}

	// Collect one completion signal per started page.
	for finished := 0; finished < pageCount; finished++ {
		<-done
	}

	fmt.Println("done!")
}
重构的代码:
package main
import(
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strings"
"os"
"image/png"
"image/jpeg"
"image/gif"
"image"
"errors"
)
// Config holds the crawler settings that the Webpage methods read.
type Config struct{
// SavePath is the prefix prepended to every saved file name by GetSaveName;
// it is concatenated as-is.
SavePath string
// MinWidth and MinHeight are the minimum image dimensions accepted by
// CheckSize. When both are <= 0 the size check is skipped entirely.
MinWidth int
MinHeight int
// Overwrite makes Download fetch an image again even when its target file
// already exists.
Overwrite bool
// MaxPage is the number of consecutive listing pages ParsePage generates.
MaxPage int
// StartPage is the first page number ParsePage substitutes into the page
// URL template.
StartPage int
}
// NewConfig builds a Config from the individual settings. Note that the
// parameter order (maxPage before startPage, overwrite last) differs from the
// field order of Config.
func NewConfig(savePath string, minWidth, minHeight, maxPage, startPage int, overwrite bool) *Config {
	cfg := &Config{
		SavePath:  savePath,
		MinWidth:  minWidth,
		MinHeight: minHeight,
		Overwrite: overwrite,
		MaxPage:   maxPage,
		StartPage: startPage,
	}
	return cfg
}
const (
	// PAGE_URL is the fmt template of a listing page; %d receives the page
	// number.
	PAGE_URL string = "http://www.meizitu.com/a/sifang_5_%d.html"
	// IMAGE_LIST_LINKS is the regular expression matching album page links
	// on a listing page. Dots are escaped so they match only a literal '.'.
	IMAGE_LIST_LINKS string = `http://www\.meizitu\.com/a/[0-9]+\.html`
	// IMAGE_IMAGE_LINKS is the regular expression matching jpg, png and gif
	// image URLs on an album page. Dots in the host name are escaped so they
	// match only a literal '.'.
	IMAGE_IMAGE_LINKS string = `http://pic\.meizitu\.com/wp-content/uploads/[^\.]+\.(jpg|png|gif)`
)
// Webpage crawls listing pages and downloads the images they reference,
// driven by the settings in Config.
type Webpage struct {
// Config is dereferenced by the methods without a nil check, so it must be
// set before any of them is called.
Config *Config
}
// NewWebpage returns a Webpage that uses config for all of its settings. The
// pointer is stored as given, not copied.
func NewWebpage(config *Config) *Webpage {
	page := new(Webpage)
	page.Config = config
	return page
}
// ParsePage expands the fmt template url into one URL per listing page,
// starting at Config.StartPage and producing Config.MaxPage consecutive
// pages. The result is nil when MaxPage is not positive.
func (w *Webpage) ParsePage(url string) []string {
	first := w.Config.StartPage
	var pages []string
	for i := 0; i < w.Config.MaxPage; i++ {
		pages = append(pages, fmt.Sprintf(url, first+i))
	}
	return pages
}
// Get performs an HTTP GET on url and returns the response body as a string.
// A failed request or a failed read of the body yields the empty string; the
// HTTP status code is not inspected.
func (w *Webpage) Get(url string) (body string) {
	resp, err := http.Get(url)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	if data, err := ioutil.ReadAll(resp.Body); err == nil {
		return string(data)
	}
	return ""
}
// ParseUrl downloads url and returns every substring of the body that matches
// the regular expression pattern. An empty (or failed) download yields an
// empty, non-nil slice. pattern is compiled with regexp.MustCompile, so an
// invalid pattern panics once a body has been fetched.
func (w *Webpage) ParseUrl(url, pattern string) (links []string) {
	fmt.Println("Parse url ==>", url)
	if body := w.Get(url); body != "" {
		return regexp.MustCompile(pattern).FindAllString(body, -1)
	}
	return []string{}
}
// GetSaveName derives the local file name for url: Config.SavePath followed
// by the last four '/'-separated segments of url, concatenated without any
// separator.
//
// A url with fewer than four segments uses all of its segments instead of
// panicking with an index out of range.
func (w *Webpage) GetSaveName(url string) string {
	parts := strings.Split(url, "/")
	if len(parts) > 4 {
		parts = parts[len(parts)-4:]
	}
	return w.Config.SavePath + strings.Join(parts, "")
}
// Download fetches every image in urls and stores it under the name returned
// by GetSaveName. An image is skipped when its file already exists (unless
// Config.Overwrite is set), when the download returns nothing, or when
// CheckSize rejects it. A failed save is only reported; processing continues
// with the next URL.
func (w *Webpage) Download(urls []string) {
	for _, link := range urls {
		fmt.Println("Start download image from url ==>", link)

		target := w.GetSaveName(link)
		if exists := w.FileExist(target); exists && !w.Config.Overwrite {
			fmt.Println("Image already exists, skip download ==>", link)
			continue
		}

		data := w.Get(link)
		// The cases are evaluated top to bottom and stop at the first match,
		// so SaveImage only runs for non-empty data that passed CheckSize.
		switch {
		case data == "":
			// Nothing was downloaded; move on silently.
		case !w.CheckSize(data, w.GetExt(link)):
			fmt.Println("Image size too small, skip download ==>", link)
		case !w.SaveImage(data, target):
			fmt.Println("Save image failed ==>", link)
		}
	}
}
// SaveImage writes body to the file name, creating or truncating it, and
// reports whether the data was written and the file closed successfully.
//
// When writing or closing fails, the incomplete file is removed: Download
// skips files that already exist, so a leftover partial file would otherwise
// never be fetched again.
func (w *Webpage) SaveImage(body, name string) bool {
	f, err := os.Create(name)
	if err != nil {
		fmt.Println("open file error")
		return false
	}

	_, writeErr := f.WriteString(body)
	// Close explicitly rather than with defer so that its error, which can
	// report a failed flush, is not lost.
	closeErr := f.Close()
	if writeErr != nil || closeErr != nil {
		// Best effort cleanup; nothing more can be done if it fails too.
		_ = os.Remove(name)
		return false
	}
	return true
}
// GetExt returns the part of url after its last '.'. A url without any dot is
// returned unchanged, and an empty url yields the empty string.
func (w *Webpage) GetExt(url string) string {
	// LastIndex returns -1 when there is no dot, which makes the slice below
	// start at 0 and return the whole string.
	dot := strings.LastIndex(url, ".")
	return url[dot+1:]
}
// CheckSize reports whether the image data in body is at least
// Config.MinWidth wide and Config.MinHeight high. ext selects the decoder and
// must be "jpg", "png" or "gif"; any other value is rejected. Data that fails
// to decode is rejected as well. When both minimums are <= 0 the check is
// disabled and every input is accepted without being decoded.
func (w *Webpage) CheckSize(body, ext string) bool {
	minWidth, minHeight := w.Config.MinWidth, w.Config.MinHeight
	if minWidth <= 0 && minHeight <= 0 {
		return true
	}

	var (
		img image.Image
		// Placeholder value; every path that reaches the error check below
		// overwrites it with the decoder's result.
		err error = errors.New("Unknow image type")
	)

	src := strings.NewReader(body)
	switch ext {
	case "jpg":
		img, err = jpeg.Decode(src)
	case "png":
		img, err = png.Decode(src)
	case "gif":
		img, err = gif.Decode(src)
	default:
		fmt.Println("Unknow image format")
		return false
	}
	if err != nil {
		return false
	}

	bounds := img.Bounds()
	return bounds.Max.X >= minWidth && bounds.Max.Y >= minHeight
}
// FileExist reports whether os.Stat succeeds for name. Every Stat failure,
// not only "does not exist", is reported as false.
func (w *Webpage) FileExist(name string) bool {
	_, err := os.Stat(name)
	return err == nil
}
// RunTask crawls every listing page produced by ParsePage concurrently, one
// goroutine per page, downloads the images of each album found, and returns
// once all pages have been processed.
//
// It returns immediately when there are no pages to crawl instead of waiting
// for a completion signal that nobody would ever send.
func (w *Webpage) RunTask() {
	pages := w.ParsePage(PAGE_URL)

	// One buffer slot per page so a finishing goroutine never blocks on its
	// completion signal.
	done := make(chan int, len(pages))

	// crawl processes a single listing page and then reports completion.
	crawl := func(page string) {
		for _, album := range w.ParseUrl(page, IMAGE_LIST_LINKS) {
			w.Download(w.ParseUrl(album, IMAGE_IMAGE_LINKS))
		}
		done <- 1
	}

	for _, page := range pages {
		go crawl(page)
	}

	// Wait for exactly one signal per started goroutine.
	for range pages {
		<-done
	}
}
// main runs the crawler with a fixed configuration: images are saved under
// F:/girls/, must be at least 400x400, existing files are kept, and a single
// listing page (number 11) is crawled.
func main() {
	// Argument order: savePath, minWidth, minHeight, maxPage, startPage,
	// overwrite.
	cfg := NewConfig("F:/girls/", 400, 400, 1, 11, false)
	NewWebpage(cfg).RunTask()
	fmt.Println("done!")
}