Golang实现词频统计

books1958 · 2015-06-17 20:03:47 · 3786 次点击 · 预计阅读时间 4 分钟 · 大约8小时之前开始浏览

这是一个创建于 2015-06-17 20:03:47 的文章，其中的信息可能已经有所发展或是发生改变。

第一次，站长亲自招 Gopher 了>>>

本例使用golang实现词频统计。步骤：

（1）从文件中读取一篇文章。

（2）统计词频，按单词出现的频率从大到小进行排序。

（3）写入到文件中。

注：任何非英文字母的符号均认为是单词分隔符（即等同于空格）。

效率：使用本程序统计一篇约 150 万单词的文章，大约需要 70 ms。

1.核心代码：

package wordtest

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"os"
	"runtime"
	"sort"
	"strings"
	"time"
)

// CountTestBase reads the text file at inputFilePath, counts how often each
// word occurs and writes a report to outputFilePath.
//
// The text is cut into chunks by splitFileText, every chunk is counted by
// countTest in its own goroutine, and the partial maps are merged here with
// their keys lower-cased. The report starts with the elapsed time and the
// total word count, followed by one "word: count, percent%" line per distinct
// word, in the order produced by sort.Sort on WordCountBeanList.
//
// Read and write failures are handed to CheckError, which panics.
func CountTestBase(inputFilePath string, outputFilePath string) {
	// Start of the timed section (reading, counting and sorting).
	start := time.Now()

	// Read the whole input file into memory.
	fileData, err := ioutil.ReadFile(inputFilePath)
	CheckError(err, "read file")
	fileText := string(fileData)

	// Number of worker goroutines, derived from the CPU count.
	workerCount := runtime.NumCPU()*2 - 1
	// GOMAXPROCS is a process-wide setting: raise it for this call only and
	// put the previous value back on return so callers are not affected.
	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(workerCount + 1))

	// Cut the text into one chunk per worker.
	parts := splitFileText(fileText, workerCount)

	// Buffered with one slot per worker so no worker blocks on its send.
	ch := make(chan map[string]int, workerCount)
	for i := 0; i < workerCount; i++ {
		go countTest(parts[i], ch)
	}

	// Collect exactly one partial map per worker and merge them,
	// folding the keys to lower case.
	totalWordsMap := make(map[string]int)
	for done := 0; done < workerCount; done++ {
		for k, v := range <-ch {
			totalWordsMap[strings.ToLower(k)] += v
		}
	}

	// Move the totals into a slice and sort it.
	list := make(WordCountBeanList, 0, len(totalWordsMap))
	for k, v := range totalWordsMap {
		list = append(list, NewWordCountBean(k, v))
	}
	sort.Sort(list)

	// End of the timed section.
	elapsed := int64(time.Since(start) / time.Millisecond)
	fmt.Printf("time consume:%dms\n", elapsed)

	// Build the report in memory.
	wordsCount := list.totalCount()
	var data bytes.Buffer
	fmt.Fprintf(&data, "程序执行：%dms\n", elapsed)
	fmt.Fprintf(&data, "文章总单词数：%d\n\n", wordsCount)
	for _, v := range list {
		percent := 100.0 * float64(v.count) / float64(wordsCount)
		fmt.Fprintf(&data, "%s: %d, %3.2f%%\n", v.word, v.count, percent)
	}

	// The report is a plain text file: owner read/write, others read-only
	// (the previous os.ModePerm requested 0777).
	err = ioutil.WriteFile(outputFilePath, data.Bytes(), os.FileMode(0644))
	CheckError(err, "ioutil.WriteFile")
}

// countTest counts the words in text and sends the resulting map on ch.
//
// A word is a maximal run of ASCII letters (A-Z, a-z); every other character
// acts as a separator. Keys keep the case they have in text.
func countTest(text string, ch chan map[string]int) {
	counts := make(map[string]int)

	// wordBegin is the byte offset where the current word started,
	// or -1 while the scan is between words.
	wordBegin := -1
	for pos, r := range text {
		isLetter := ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z')
		switch {
		case isLetter && wordBegin < 0:
			// First letter of a new word.
			wordBegin = pos
		case !isLetter && wordBegin >= 0:
			// A separator closes the word that was in progress.
			counts[text[wordBegin:pos]]++
			wordBegin = -1
		}
	}

	// The text may end in the middle of a word.
	if wordBegin >= 0 {
		counts[text[wordBegin:]]++
	}
	ch <- counts
}

// splitFileText cuts fileText into n consecutive pieces whose concatenation
// is fileText again.
//
// The nominal cut points are multiples of len(fileText)/n (in bytes); each one
// is moved forward to the next space character so that a cut never lands
// inside a space-delimited word. When no space follows a cut point, the cut is
// placed at the end of the text and the remaining pieces are empty.
//
// It returns nil when n <= 0.
func splitFileText(fileText string, n int) []string {
	if n <= 0 {
		return nil
	}

	length := len(fileText)
	parts := make([]string, n)

	lastPosition := 0
	for i := 0; i < n-1; i++ {
		position := length / n * (i + 1)
		// An earlier cut may already lie past this nominal position;
		// resume from it instead of rescanning the same bytes.
		if position < lastPosition {
			position = lastPosition
		}
		// Advance to the next space, stopping at the end of the text so
		// the index never runs out of range.
		for position < length && fileText[position] != ' ' {
			position++
		}

		parts[i] = fileText[lastPosition:position]
		lastPosition = position
	}

	// The last piece takes whatever is left.
	parts[n-1] = fileText[lastPosition:]
	return parts
}

// CheckError panics when err is non-nil. The panic value is the string
// msg + "," + err.Error(). A nil err is a no-op.
func CheckError(err error, msg string) {
	if err == nil {
		return
	}
	panic(msg + "," + err.Error())
}

2.一个struct

package wordtest

// WordCountBean pairs a word with the number of times it was counted.
type WordCountBean struct {
	word  string
	count int
}

// NewWordCountBean returns a pointer to a new WordCountBean holding word and
// count.
func NewWordCountBean(word string, count int) *WordCountBean {
	bean := WordCountBean{word: word, count: count}
	return &bean
}

// WordCountBeanList is a slice of beans with Len, Less and Swap methods, so it
// can be passed to sort.Sort.
type WordCountBeanList []*WordCountBean

// Len returns the number of beans in the list.
func (l WordCountBeanList) Len() int { return len(l) }

// Less orders beans by descending count; beans with equal counts are ordered
// by ascending word.
func (l WordCountBeanList) Less(i, j int) bool {
	a, b := l[i], l[j]
	if a.count != b.count {
		return a.count > b.count
	}
	return a.word < b.word
}

// Swap exchanges the beans at positions i and j.
func (l WordCountBeanList) Swap(i, j int) {
	l[i], l[j] = l[j], l[i]
}

// totalCount returns the sum of the count fields of all beans in the list.
func (l WordCountBeanList) totalCount() int {
	sum := 0
	for idx := range l {
		sum += l[idx].count
	}
	return sum
}

3.主函数：

package main

import (
	"WordsTest/wordtest"
)

// main passes the fixed input and output paths below to
// wordtest.CountTestBase.
func main() {
	const (
		inputFilePath  = "files/article.txt"
		outputFilePath = "files/hanjun-result.txt"
	)

	wordtest.CountTestBase(inputFilePath, outputFilePath)
}

有疑问加站长微信联系（非本文作者）

本文来自：CSDN博客

感谢作者：books1958

查看原文：Golang实现词频统计

入群交流（和以上内容无关）：加入Go大咖交流群，或添加微信：liuxiaoyan-s 备注：入群；或加QQ群：692541889

3786 次点击

加入收藏微博

收入我的专栏

上一篇：golang websocket的例子

下一篇：golang之sqlserver连接

词频

runtime

接收数据

程序执行

1 回复 | 直到 2018-08-10 10:03:11

RocherKong · #1 · 7年之前

中文如何处理？

添加一条新回复（您需要登录后才能回复。没有账号？）

请尽量让自己的回复能够对别人有帮助
支持 Markdown 格式, **粗体**、~~删除线~~、`单行代码`
支持 @ 本站用户；支持表情（输入 : 提示），见 Emoji cheat sheet
图片支持拖拽、截图粘贴等方式上传

关注我

扫码关注领全套学习资料
加入 QQ 群：
- 192706294（已满）
- 731990104（已满）
- 798786647（已满）
- 729884609（已满）
- 977810755（已满）
- 815126783（已满）
- 812540095（已满）
- 1006366459（已满）
- 692541889
加入微信群：liuxiaoyan-s，备注入群
也欢迎加入知识星球 Go粉丝们（免费）

Golang实现词频统计

用户登录

今日阅读排行

一周阅读排行

关注我

Golang实现词频统计

用户登录

今日阅读排行

一周阅读排行

关注我

给该专栏投稿 写篇新文章

收入到我管理的专栏 新建专栏

给该专栏投稿写篇新文章

收入到我管理的专栏新建专栏