GO语言学习群721929980
**常用正则规则**
Go语言标准库内建提供了regexp包
- . 匹配除换行符以外的任意字符
- \w 匹配字母或数字或下划线(在Go的regexp中等价于[0-9A-Za-z_],不匹配汉字;匹配汉字可用\p{Han})
- \s 匹配任意的空白符
- \d 匹配数字
- \b 匹配单词的开始或结束
- ^ 匹配字符串的开始
- $ 匹配字符串的结束
- \* 重复零次或更多次
- \+ 重复一次或更多次
- ? 重复零次或一次
- {n} 重复n次
- {n,} 重复n次或更多次
- {n,m} 重复n到m次
- 捕获 (exp) 匹配exp,并捕获文本到自动命名的组里
- (?P<name>exp) 匹配exp,并捕获文本到名称为name的组里(Go 1.22起也可以写成(?<name>exp);Go的regexp不支持(?'name'exp)写法)
- (?:exp) 匹配exp,不捕获匹配的文本,也不给此分组分配组号
**导入包**
```
import (
"net/http"
"fmt"
"io/ioutil"
"regexp"
)
```
**定义正则表达式**
```
// Regular-expression sources used by the Spider* examples. They are kept as
// raw strings and compiled at the point of use with regexp.MustCompile.
var (
	// QQ mailbox address; capture group 1 is the numeric QQ account.
	// The dot before "com" is escaped so it only matches a literal '.'.
	reQQEmail = `(\d+)@qq\.com`
	// Generic e-mail address: local part, '@', domain, and an optional
	// second suffix (group 1), e.g. the ".cn" in "example.com.cn".
	reEmail = `\w+@\w+\.\w+(\.\w+)?`

	// Hyperlinks. Sample input:
	// <a href="http://news.baidu.com/ns?cl=2&rn=20&tn=news&word=%C1%F4%CF%C2%D3%CA%CF%E4%20%B5%BA%B9%FA"
	//
	// reLinkBad starts matching at "<a" and lazily skips any characters
	// (including newlines) until an href attribute, so the full match can
	// span unrelated markup. Group 1 is the http(s) URL.
	reLinkBad = `<a[\s\S]*?href="(https?://[\s\S]+?)"`
	// reLink anchors on the href attribute only. Group 1 is the http(s) URL.
	reLink = `href="(https?://[\s\S]+?)"`

	// Mobile phone number laid out as 13x xxxx xxxx: a leading 1, a second
	// digit from 3/4/5/7/8/9, one more digit, then two groups of four digits,
	// each optionally preceded by a single whitespace character.
	rePhone = `1[345789]\d\s?\d{4}\s?\d{4}`

	// ID card number laid out as 123456 1990 0817 123X: first digit 1-6 plus
	// five digits, a year (19xx or 2000-2019), a month (01-12), a day (01-31),
	// three digits, and a final digit or upper-case X.
	reIdcard = `[123456]\d{5}((19\d{2})|(20[01]\d))((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))\d{3}[\dX]`

	// Image link: a double-quoted http(s) URL ending in one of the listed
	// file extensions. Group 1 is the URL without the surrounding quotes.
	// Sample input:
	// "http://img2.imgtn.bdimg.com/it/u=2403021088,4222830812&fm=26&gp=0.jpg"
	reImg = `"(https?://[^"]+?(\.((jpg)|(jpeg)|(png)|(gif)|(bmp)|(svg)|(swf)|(ico))))"`
)
```
**预定义错误处理函数**
```
// HandleError reports err on standard output, prefixed with why (a short
// description of the operation that failed). It does nothing when err is nil.
//
// The function only reports; it does not stop the program, so callers must
// still decide how to proceed after a failure.
func HandleError(err error, why string) {
	if err == nil {
		return
	}
	// Separate the context from the error text and end the line, so that
	// consecutive reports do not run into each other or into later output.
	fmt.Printf("%s: %v\n", why, err)
}
```
**获得页面html方法封装**
```
// GetPageStr issues an HTTP GET for url and returns the response body as a
// string.
//
// Failures are reported through HandleError. If the request itself fails the
// empty string is returned. If reading the body fails, the bytes read before
// the error are still returned.
func GetPageStr(url string) (pageStr string) {
	resp, err := http.Get(url)
	if err != nil {
		// resp is nil when http.Get fails, so there is no body to close or
		// read; touching resp.Body here would panic.
		HandleError(err, "http.Get url")
		return ""
	}
	defer resp.Body.Close()

	pageBytes, err := ioutil.ReadAll(resp.Body)
	// HandleError ignores a nil err, so this is a no-op on success.
	HandleError(err, "ioutil.ReadAll")
	return string(pageBytes)
}
```
**爬邮箱**
```
// SpiderEmail downloads a Baidu Tieba thread, appends one fixed sample
// address to the page text, and prints every reEmail match on its own line.
// Each match is printed as the full submatch slice (whole match plus groups).
func SpiderEmail() {
	// The appended literal presumably guarantees at least one address with a
	// two-level suffix (".com.cn") is present — TODO confirm intent.
	html := GetPageStr("http://tieba.baidu.com/p/2544042204") + "ximendong@21centry.com.cn"

	matches := regexp.MustCompile(reEmail).FindAllStringSubmatch(html, -1)
	for i := 0; i < len(matches); i++ {
		fmt.Println(matches[i])
	}
}
```
**爬超链接**
```
// SpiderLink downloads a Baidu search-result page and prints the number of
// reLink matches followed by each captured URL (group 1), with a blank line
// after every URL.
func SpiderLink() {
	html := GetPageStr("http://www.baidu.com/s?wd=%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1%20%E5%B2%9B%E5%9B%BD")

	linkRe := regexp.MustCompile(reLink)
	matches := linkRe.FindAllStringSubmatch(html, -1)

	fmt.Printf("共找到%d条结果:\n", len(matches))
	for _, m := range matches {
		// URL, newline, then one empty line as a separator.
		fmt.Printf("%s\n\n", m[1])
	}
}
```
**爬手机**
```
// SpiderMobilePhone downloads a phone-number listing page and prints the
// number of rePhone matches followed by each match's submatch slice, with a
// blank line after every entry.
func SpiderMobilePhone() {
	html := GetPageStr("http://www.zhaohaowang.com/aspx/zhw/index.html?CityId=1#BJ=05698")

	phoneRe := regexp.MustCompile(rePhone)
	hits := phoneRe.FindAllStringSubmatch(html, -1)

	fmt.Printf("共找到%d条结果:\n", len(hits))
	for i := range hits {
		// The whole slice is printed (default %v formatting), then an empty line.
		fmt.Printf("%v\n\n", hits[i])
	}
}
```
**爬身份证**
```
// SpiderIdcard downloads the page at http://sfz.ckd.cc/ and prints the number
// of reIdcard matches followed by each full match, with a blank line after
// every entry.
func SpiderIdcard() {
	html := GetPageStr("http://sfz.ckd.cc/")
	found := regexp.MustCompile(reIdcard).FindAllStringSubmatch(html, -1)

	fmt.Printf("共找到%d条结果:\n", len(found))
	for idx := 0; idx < len(found); idx++ {
		// Element 0 is the whole matched number; the groups are not printed.
		whole := found[idx][0]
		fmt.Print(whole, "\n\n")
	}
}
```
**爬图片**
```
// SpiderImg downloads a Baidu image-search page and prints the number of
// reImg matches followed by each captured image URL (group 1, without the
// surrounding quotes), with a blank line after every URL.
func SpiderImg() {
	html := GetPageStr("http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E7%BE%8E%E5%A5%B3")

	imgRe := regexp.MustCompile(reImg)
	images := imgRe.FindAllStringSubmatch(html, -1)

	fmt.Printf("共找到%d条结果:\n", len(images))
	for _, img := range images {
		imgURL := img[1]
		fmt.Printf("%s\n\n", imgURL)
	}
}
```
更多实战:https://blog.csdn.net/u010986776/article/category/8095305
GO语言学习群721929980