前言
在游戏设计中一个最基本的功能就是处理屏蔽字、敏感字,至于为什么这个需求这么重要?你懂的。在网上搜了很多资料,发现主要有三种实现方式:
对于输入的一句话message,
- 1、循环使用所有的屏蔽词去message中查找,看是否存在,如果存在则判定为敏感词。
优点:简单,只要学过几个月编程的都会;
缺点:效率太低,而且效果不是太好; - 2、将有共同起始内容的屏蔽词分为一组,然后再使用方式1。
优点:比方案1效率高一些;
缺点:效率仍然很低,而且效果太差; - 3、使用DFA算法。
优点:效率高,内存消耗低;
缺点:与前两者相比,实现复杂。
综上:为了应对越来越多的敏感词,寻找一个高效率的敏感词过滤算法就摆在了各个程序员的面前。而DFA是所有方法里面效率最高的。
在网上看了很多现有的实现方式,觉得存在各种各样的问题,于是自己就决定来写一个。
数据结构
算法与数据结构是密不可分的。数据结构决定了在其之上的算法。DFA算法采用了Trie树这种数据结构来存储数据。
Trie树的详细介绍
算法实现
Golang Version
- trieNode类型的定义
const (
INIT_TRIE_CHILDREN_NUM = 128 // Since we need to deal all kinds of language, so we use 128 instead of 26
)
// trieNode data structure
// trieNode itself doesn't have any value. The value is represented on the path
type trieNode struct {
// if this node is the end of a word
isEndOfWord bool
// the collection of children of this node
children map[rune]*trieNode
}
// Create new trieNode
func newtrieNode() *trieNode {
return &trieNode{
isEndOfWord: false,
children: make(map[rune]*trieNode, INIT_TRIE_CHILDREN_NUM),
}
}
- 匹配时上、下界索引的定义
// Match index object
type matchIndex struct {
start int // start index
end int // end index
}
// Construct from scratch
func newMatchIndex(start, end int) *matchIndex {
return &matchIndex{
start: start,
end: end,
}
}
// Construct from existing match index object
func buildMatchIndex(obj *matchIndex) *matchIndex {
return &matchIndex{
start: obj.start,
end: obj.end,
}
}
- 核心算法实现
// dfa util
type DFAUtil struct {
// The root node
root *trieNode
}
func (this *DFAUtil) insertWord(word []rune) {
currNode := this.root
for _, c := range word {
if cildNode, exist := currNode.children[c]; !exist {
cildNode = newtrieNode()
currNode.children[c] = cildNode
currNode = cildNode
} else {
currNode = cildNode
}
}
currNode.isEndOfWord = true
}
// Check if there is any word in the trie that starts with the given prefix.
func (this *DFAUtil) startsWith(prefix []rune) bool {
currNode := this.root
for _, c := range prefix {
if cildNode, exist := currNode.children[c]; !exist {
return false
} else {
currNode = cildNode
}
}
return true
}
// Searc and make sure if a word is existed in the underlying trie.
func (this *DFAUtil) searcWord(word []rune) bool {
currNode := this.root
for _, c := range word {
if cildNode, exist := currNode.children[c]; !exist {
return false
} else {
currNode = cildNode
}
}
return currNode.isEndOfWord
}
// Searc a whole sentence and get all the matcing words and their indices
// Return:
// A list of all the matc index object
func (this *DFAUtil) searcSentence(sentence string) (matchIndexList []*matchIndex) {
start, end := 0, 1
sentenceRuneList := []rune(sentence)
// Iterate the sentence from the beginning to the end.
startsWith := false
for end <= len(sentenceRuneList) {
// Check if a sensitive word starts with word range from [start:end)
// We find the longest possible path
// Then we check any sub word is the sensitive word from long to short
if this.startsWith(sentenceRuneList[start:end]) {
startsWith = true
end += 1
} else {
if startsWith == true {
// Check any sub word is the sensitive word from long to short
for index := end - 1; index > start; index-- {
if this.searcWord(sentenceRuneList[start:index]) {
matchIndexList = append(matchIndexList, newMatchIndex(start, index-1))
break
}
}
}
start, end = end-1, end+1
startsWith = false
}
}
// If finishing not because of unmatching, but reaching the end, we need to
// check if the previous startsWith is true or not.
// If it's true, we need to check if there is any candidate?
if startsWith {
for index := end - 1; index > start; index-- {
if this.searcWord(sentenceRuneList[start:index]) {
matchIndexList = append(matchIndexList, newMatchIndex(start, index-1))
break
}
}
}
return
}
// Judge if input sentence contains some special caracter
// Return:
// Matc or not
func (this *DFAUtil) IsMatch(sentence string) bool {
return len(this.searcSentence(sentence)) > 0
}
// Handle sentence. Use specified caracter to replace those sensitive caracters.
// input: Input sentence
// replaceCh: candidate
// Return:
// Sentence after manipulation
func (this *DFAUtil) HandleWord(sentence string, replaceCh rune) string {
matchIndexList := this.searcSentence(sentence)
if len(matchIndexList) == 0 {
return sentence
}
// Manipulate
sentenceList := []rune(sentence)
for _, matchIndexObj := range matchIndexList {
for index := matchIndexObj.start; index <= matchIndexObj.end; index++ {
sentenceList[index] = replaceCh
}
}
return string(sentenceList)
}
// Create new DfaUtil object
// wordList:word list
func NewDFAUtil(wordList []string) *DFAUtil {
this := &DFAUtil{
root: newtrieNode(),
}
for _, word := range wordList {
wordRuneList := []rune(word)
if len(wordRuneList) > 0 {
this.insertWord(wordRuneList)
}
}
return this
}
- 测试用例
import (
"testing"
)
func TestIsMatch(t *testing.T) {
sensitiveList := []string{"中国", "中国人"}
input := "我来自中国cd"
util := NewDFAUtil(sensitiveList)
if util.IsMatch(input) == false {
t.Errorf("Expected true, but got false")
}
}
func TestHandleWord(t *testing.T) {
sensitiveList := []string{"中国", "中国人"}
input := "我来自中国cd"
util := NewDFAUtil(sensitiveList)
newInput := util.HandleWord(input, '*')
expected := "我来自**cd"
if newInput != expected {
t.Errorf("Expected %s, but got %s", expected, newInput)
}
}
C# Version
- TrieNode类型的定义
/// <summary>
/// TrieNode data structure
/// TrieNode itself doesn't have any value. The value is represented on the path
/// </summary>
internal sealed class TrieNode
{
/// <summary>
/// if this node is the end of a word
/// </summary>
public Boolean IsEndOfWord { get; set; }
/// <summary>
/// the collection of children of this node
/// </summary>
public Dictionary<Char, TrieNode> Children = new Dictionary<Char, TrieNode>();
}
- 匹配时上、下界索引的定义
/// <summary>
/// Match index object
/// </summary>
internal sealed class MatchIndex
{
/// <summary>
/// start index
/// </summary>
public Int32 Start { get; set; }
/// <summary>
/// end index
/// </summary>
public Int32 End { get; set; }
/// <summary>
/// construct from scratch
/// </summary>
/// <param name="start">start index</param>
/// <param name="end">end index</param>
public MatchIndex(Int32 start, Int32 end)
{
this.Start = start;
this.End = end;
}
/// <summary>
/// construct from existing match index object
/// </summary>
/// <param name="other">existing match index object</param>
public MatchIndex(MatchIndex other)
{
this.Start = other.Start;
this.End = other.End;
}
}
- 核心算法实现
/// <summary>
/// Dfa util
/// </summary>
public sealed class DFAUtil
{
/// <summary>
/// The root node
/// </summary>
private readonly TrieNode Root = new TrieNode();
/// <summary>
/// Create new DfaUtil object
/// </summary>
/// <param name="wordList">word list</param>
public DFAUtil(IEnumerable<String> wordList)
{
foreach (var word in wordList)
{
Char[] chArr = word.ToCharArray();
if (chArr.Length > 0)
{
InsertWord(chArr);
}
}
}
private void InsertWord(Char[] word)
{
var currNode = this.Root;
foreach (var ch in word)
{
if (currNode.Children.ContainsKey(ch) == false)
{
var childNode = new TrieNode();
currNode.Children[ch] = childNode;
currNode = childNode;
}
else
{
currNode = currNode.Children[ch];
}
}
currNode.IsEndOfWord = true;
}
/// <summary>
/// Check if there is any word in the trie that starts with the given prefix.
/// </summary>
/// <param name="prefix">prefix</param>
/// <returns></returns>
private Boolean StartsWith(Char[] prefix)
{
var currNode = this.Root;
foreach (var c in prefix)
{
if (currNode.Children.ContainsKey(c) == false)
{
return false;
}
else
{
currNode = currNode.Children[c];
}
}
return true;
}
/// <summary>
/// Search and make sure if a word is existed in the underlying trie.
/// </summary>
/// <param name="word">The char array of a word</param>
/// <returns></returns>
private Boolean SearchWord(Char[] word)
{
var currNode = this.Root;
foreach (var c in word)
{
if (currNode.Children.ContainsKey(c) == false)
{
return false;
}
else
{
currNode = currNode.Children[c];
}
}
return currNode.IsEndOfWord;
}
/// <summary>
/// Search a whole sentence and get all the matching words and their indices
/// </summary>
/// <param name="sentence">Input sentence</param>
/// <returns>A list of all the match index object</returns>
private List<MatchIndex> SearchSentence(String sentence)
{
var start = 0;
var end = 1;
List<MatchIndex> matchIndexList = new List<MatchIndex>();
// Iterate the sentence from the beginning to the end.
var startsWith = false;
Char[] sentenceChArr = sentence.ToCharArray();
while (end <= sentenceChArr.Length)
{
// Check if a sensitive word starts with word range from [start:end)
// We find the longest possible path
// Then we check any sub word is the sensitive word from long to short
if (this.StartsWith(sentenceChArr.Skip(start).Take(end - start).ToArray()))
{
startsWith = true;
end++;
}
else
{
if (startsWith)
{
// Check any sub word is the sensitive word from long to short
for (var index = end - 1; index > start; index--)
{
if (this.SearchWord(sentenceChArr.Skip(start).Take(index - start).ToArray()))
{
matchIndexList.Add(new MatchIndex(start, index - 1));
break;
}
}
}
start = end - 1;
end = end + 1;
startsWith = false;
}
}
// If finishing not because of unmatching, but reaching the end, we need to
// check if the previous startsWith is true or not.
// If it's true, we need to check if there is any candidate?
if (startsWith)
{
for (var index = end - 1; index > start; index--)
{
if (this.SearchWord(sentenceChArr.Skip(start).Take(index - start).ToArray()))
{
matchIndexList.Add(new MatchIndex(start, index - 1));
break;
}
}
}
return matchIndexList;
}
/// <summary>
/// Judge if input sentence contains some special character
/// </summary>
/// <param name="sentence">Input sentence</param>
/// <returns>Match or not</returns>
public Boolean IsMatch(String sentence)
{
//最终的索引范围列表<索引下限、索引上限>
List<MatchIndex> matchIndexList = this.SearchSentence(sentence);
if (matchIndexList.Count > 0)
{
return true;
}
// Filter some fixed sensitive character
return StringUtil.IfHaveSpecialChar(sentence);
}
/// <summary>
/// Handle sentence. Use specified character to replace those sensitive characters.
/// </summary>
/// <param name="sentence">Input sentence</param>
/// <param name="replaceCh">candidate</param>
/// <returns>Sentence after manipulation</returns>
public String HandleWord(String sentence, Char replaceCh)
{
List<MatchIndex> matchIndexList = this.SearchSentence(sentence);
if (matchIndexList.Count == 0)
{
return sentence;
}
// Manipulate
Char[] chArr = sentence.ToCharArray();
foreach (var item in matchIndexList)
{
for (var i = item.Start; i <= item.End; i++)
{
chArr[i] = replaceCh;
}
}
return new String(chArr);
}
}
- 测试用例
public static void Test1()
{
String[] sensitiveWordList = { "中国", "中国人" };
String input = "我来自中国cd";
DFAUtil util = new DFAUtil(sensitiveWordList);
Console.WriteLine(util.IsMatch(input));
Console.WriteLine(util.HandleWord(input, '*'));
}
复杂度分析
- 时间复杂度
由于底层使用的是Trie树,所有分支的存储是使用哈希表,所以每次查找字符所需要的是一次操作;但是对于输入的字符串,需要进行一次完整的遍历,所以对应的时间复杂度为O(n)。 - 空间复杂度
初始化时,每一个不同的字符都会被存储一次,所以所需的空间复杂度为O(n);而查询时,无需额外的空间。所以总的空间复杂度为O(n)。
有疑问加站长微信联系(非本文作者)