Go语言源码中Replacer查找部分的笔记

xcltapestry · · 2132 次点击 · · 开始浏览

这是一篇较早创建的文章，其中的信息可能已经有所发展或是发生改变。

用过strings.NewReplacer和replacer.Replace():把"旧串,新串"成对传入后,它能按参数顺序决定的优先级进行替换,并能处理中文字符串参数.

觉得功能强大,特别好用.对它的查找和优先级怎么处理有点兴趣,花时间研究了下源码,在这记录一下个人理解.

package main 

//author:xcl
//2014-1-20 记录

import (
	"fmt"
    "strings"
)

// main replaces text with the standard library's strings.Replacer, then
// calls the local NewReplacer so the trie construction and lookups of the
// copied implementation are traced to stdout, and finally prints the input
// and the replaced output.
func main() {
	// Old/new pairs: even positions are the patterns, odd positions are
	// their replacements.
	patterns := []string{
		"y", "25",
		"中", "国",
		"中工", "家伙",
	}

	// Alternative input for experimenting with a large number of keys:
	//
	//	patterns := make([]string, 270*2)
	//	for i := 0; i < 270*2; i++ {
	//		patterns[i] = fmt.Sprintf("%d", i)
	//	}

	const input = "中(国)--中工(家伙)"
	output := strings.NewReplacer(patterns...).Replace(input)

	// Emit the debug trace of the local trie before the summary lines.
	NewReplacer(patterns...)

	fmt.Println("\nmain() replacer.Replace old=", input)
	fmt.Println("main() replacer.Replace new=", output)
}


// NewReplacer builds the local generic replacer trie from oldnew, a list of
// old/new string pairs, and prints the result of a few sample lookups so the
// priority handling can be observed.
//
// It panics if oldnew has an odd number of elements. Without this check the
// same input would still panic, but only later inside makeGenericReplacer
// with an index-out-of-range error after part of the trace was printed.
func NewReplacer(oldnew ...string) {
	if len(oldnew)%2 == 1 {
		panic("NewReplacer: odd argument count")
	}

	r := makeGenericReplacer(oldnew)

	// ignoreRoot=true: a value stored on the root node is not considered.
	val, keylen, found := r.lookup("中", true)
	fmt.Println("\nNewReplacer() 中   val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("中工", true)
	fmt.Println("NewReplacer() 中工 val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("y", false)
	fmt.Println("NewReplacer() y    val:", val, " keylen:", keylen, " found:", found)

	// Lookups to try together with the numeric pattern set in main:
	//
	//	val, keylen, found := r.lookup("2", true)
	//	fmt.Println("\nNewReplacer() 2   val:", val, " keylen:", keylen, " found:", found)
	//
	//	val, keylen, found = r.lookup("3", true)
	//	fmt.Println("\nNewReplacer() 3   val:", val, " keylen:", keylen, " found:", found)
}


// genericReplacer holds a byte-wise trie of replacement keys together with
// the byte-to-index mapping used by every lookup table in that trie.
type genericReplacer struct {
    root trieNode  // root node of the trie
    // tableSize is the size of a trie node's lookup table. It is the number
    // of unique key bytes.
    tableSize int
    // mapping maps from key bytes to a dense index for trieNode.table.
    // makeGenericReplacer stores tableSize for bytes that occur in no key.
    mapping [256]byte  
}

// makeGenericReplacer builds a genericReplacer from oldnew, which holds
// old/new pairs (pattern at even positions, replacement at odd positions).
// Every step is traced to stdout.
func makeGenericReplacer(oldnew []string) *genericReplacer {
	r := &genericReplacer{}
	total := len(oldnew)

	// Step 1: flag every byte value that occurs in some pattern. A UTF-8
	// encoded key contributes each of its bytes separately (e.g. 228 for
	// the first byte of "中" sets r.mapping[228] = 1).
	for i := 0; i < total; i += 2 {
		key := oldnew[i]
		fmt.Println("\nmakeGenericReplacer() for key=", key)
		for j, c := range []byte(key) {
			r.mapping[c] = 1
			fmt.Println("makeGenericReplacer() key[", j, "]=", c)
		}
	}

	// Step 2: the table size is the number of flagged bytes.
	for i := range r.mapping {
		if r.mapping[i] != 0 {
			r.tableSize++
		}
	}
	fmt.Println("makeGenericReplacer()  r.tableSize=", r.tableSize)

	// Step 3: turn the flags into dense indexes, handed out in ascending
	// byte order. Bytes that occur in no key get the sentinel tableSize.
	var next byte
	for i := range r.mapping {
		if r.mapping[i] == 0 {
			r.mapping[i] = byte(r.tableSize)
			continue
		}
		r.mapping[i] = next
		fmt.Println("makeGenericReplacer()  r.mapping[", i, "] =", r.mapping[i])
		next++
	}

	// Ensure root node uses a lookup table (for performance).
	r.root.table = make([]*trieNode, r.tableSize)

	// Step 4: insert every pair. The priority is total-i, so pairs that
	// appear earlier in oldnew receive a larger (stronger) priority.
	for i := 0; i < total; i += 2 {
		r.root.add(oldnew[i], oldnew[i+1], total-i, r)
	}
	return r
}



// trieNode is one node of the replacer trie. As built by add, a node either
// carries a prefix/next pair, or a lookup table, or neither; add clears
// prefix and next when it installs a table on a node.
type trieNode struct {
    // value is the replacement string recorded by add when a key ends at
    // this node.
    value string
    // priority is recorded together with value; 0 means add has not stored
    // a value here. lookup keeps the value with the largest priority.
    priority int

    // prefix is a run of key bytes that must match in full to move on to
    // next.
    prefix string
    next   *trieNode
    // table holds child nodes indexed by genericReplacer.mapping[b] for the
    // next key byte b.
    table []*trieNode 
}

// add inserts the remaining bytes of a key below node t and records val and
// priority on the node where the key ends. r supplies the byte-to-index
// mapping and the table size. Each call is traced to stdout.
func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
	fmt.Println("trieNode->add() val=", val, " key=", key)

	// The whole key has been consumed: this node is where the key ends.
	// A node that already has a priority keeps its existing value.
	if key == "" {
		if t.priority == 0 {
			t.value = val
			t.priority = priority
			fmt.Println("trieNode->add() t.priority==", priority)
		}
		return
	}

	switch {
	case t.prefix != "":
		// The node already stores a prefix; it may have to be split.
		// common is the length of the longest common prefix of
		// t.prefix and key.
		common := 0
		for common < len(t.prefix) && common < len(key) && t.prefix[common] == key[common] {
			common++
		}

		switch {
		case common == len(t.prefix):
			// The key covers the whole prefix: continue below it.
			t.next.add(key[common:], val, priority, r)

		case common == 0:
			// First byte differs, start a new lookup table here. Looking
			// up what is currently t.prefix[0] leads to oldBranch, and
			// looking up key[0] leads to newBranch.
			oldBranch := t.next
			if len(t.prefix) > 1 {
				// Keep the rest of the old prefix on a fresh node.
				oldBranch = &trieNode{
					prefix: t.prefix[1:],
					next:   t.next,
				}
			}
			newBranch := &trieNode{}

			t.table = make([]*trieNode, r.tableSize)
			t.table[r.mapping[t.prefix[0]]] = oldBranch
			t.table[r.mapping[key[0]]] = newBranch
			t.prefix, t.next = "", nil

			newBranch.add(key[1:], val, priority, r)

		default:
			// Insert a new node after the common section of the prefix.
			tail := &trieNode{
				prefix: t.prefix[common:],
				next:   t.next,
			}
			t.prefix = t.prefix[:common]
			t.next = tail
			tail.add(key[common:], val, priority, r)
		}

	case t.table != nil:
		// Insert into the existing table, creating the child on demand.
		slot := r.mapping[key[0]]
		child := t.table[slot]
		if child == nil {
			child = new(trieNode)
			t.table[slot] = child
		}
		child.add(key[1:], val, priority, r)

	default:
		// Node without prefix or table: store the rest of the key as the
		// prefix and put the value on a new node behind it.
		t.prefix = key
		t.next = new(trieNode)
		t.next.add("", val, priority, r)
	}
}


// lookup walks the trie along the leading bytes of s and reports the value
// with the highest priority seen on the way, the number of bytes of s that
// its key covers, and whether any value was found. When ignoreRoot is true a
// value stored on the root node is not considered.
func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
	best := 0     // highest priority seen so far
	consumed := 0 // bytes of s matched on the way to the current node

	for cur := &r.root; cur != nil; {
		skipRoot := ignoreRoot && cur == &r.root
		if !skipRoot && cur.priority > best {
			best = cur.priority
			val = cur.value
			keylen = consumed
			found = true
		}

		if s == "" {
			break
		}

		switch {
		case cur.table != nil:
			slot := r.mapping[s[0]]
			if int(slot) == r.tableSize {
				// The next byte occurs in no key at all: stop searching.
				return val, keylen, found
			}
			cur = cur.table[slot]
			s = s[1:]
			consumed++

		case cur.prefix != "" && HasPrefix(s, cur.prefix):
			// Nodes without a table keep the following key bytes in
			// prefix; skip over them in one step.
			consumed += len(cur.prefix)
			s = s[len(cur.prefix):]
			cur = cur.next

		default:
			return val, keylen, found
		}
	}
	return val, keylen, found
}


// HasPrefix reports whether prefix is a leading substring of s. An empty
// prefix matches every string.
func HasPrefix(s, prefix string) bool {
	n := len(prefix)
	if len(s) < n {
		return false
	}
	return s[:n] == prefix
}

记录:

ascii范围内的只占一个字节,如y(121)
utf8中常用汉字每个占三个字节.如中(228,184,173)

构建树:
如果是新的第一个单词或词组
先进 } else if t.table != nil {
然后再进 else,这中间会把 t.prefix = key,把key值存放在prefix,将""传给下一个node
最后执行 if key == "" && t.priority == 0 { ,将 t.value = val
即key的字符编码(第一个字节)对应的root.table位置开始,依次指向另外的字符编码node,中间node的prefix存下key值.
最末一个node,存下对应的val及priority.

如果是后传入的单词或词组,先从key字符编码首个字节对应的root.table位置开始,依次查找,
} else if t.table != nil {
如果已有前缀的,进行比较 if t.prefix != "" {
1, 如目前prefix与key完全一致,则继续构建树子节点
2. 如prefix与key完全不同,则另起炉灶,构建一条新的tree
prefixNode 承上,keyNode 启下
至于为什么t.table = make([]*trieNode, r.tableSize),是为了预留映射空间.
所以它是这么弄的,而不是t.table[0],t.table[1].
t.table[r.mapping[t.prefix[0]]] = prefixNode
t.table[r.mapping[key[0]]] = keyNode
3. 有部分相同,在公共前缀之后插入一个新node(其prefix为t.prefix[n:]),然后从key[n:]开始继续构建树子节点

priority:
在这的定义是数字越大,优先级别越高

if key == "" { //key的所有字节都已放入树中,当前node就是这个key的终点
if t.priority == 0 { //如果该node已经定义过priority(同一个key重复出现)就略过,保留先传入的;否则把val和priority记到该node上

//对应{中,中工}这种("中"在前),虽然后面有"中工",但"中"的priority更高,所以"中工"对应的值虽然能找到,却不会被返回.
if node.priority > bestPriority { bestPriority = node.priority}

例如:中工(priority=4),中(priority=2)
patterns:
"中工","家伙",
"中","国",
则:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: \xb8\xad
lookup() bestPriority: 0 node.priority: 2 value: 国 prefix: 工
NewReplacer() 中 val: 国 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: \xb8\xad
lookup() bestPriority: 0 node.priority: 2 value: 国 prefix: 工
lookup() bestPriority: 2 node.priority: 4 value: 家伙 prefix:
NewReplacer() 中工 val: 家伙 keylen: 6 found: true
main() replacer.Replace old= 中(国)--中工(家伙)
main() replacer.Replace new= 国(国)--家伙(家伙)

如果调整下顺序,把中->国提前,则会发现,下面的结果:
patterns:
"中","国",
"中工","家伙",
则:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: \xb8\xad
lookup() bestPriority: 0 node.priority: 4 value: 国 prefix: 工
NewReplacer() 中 val: 国 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: \xb8\xad
lookup() bestPriority: 0 node.priority: 4 value: 国 prefix: 工
lookup() bestPriority: 4 node.priority: 2 value: 家伙 prefix:
NewReplacer() 中工 val: 国 keylen: 3 found: true
main() replacer.Replace old= 中(国)--中工(家伙)
main() replacer.Replace new= 国(国)--国工(家伙)

还有,刚发现 lookup(s string, ignoreRoot bool) (val string, keylen int,found bool) {}中
定义在返回值中的变量(即命名返回值),原来可以直接在函数中使用,最后用不带参数的return就能把它们返回.