初级会员
  • 第 2116 位会员
  • bluealbin
  • bluealbin@126.com
  • 2014-12-24 07:56:27
  • Offline
  • 0

最近发布的文章

    暂无

最近分享的资源

    暂无

最近发布的项目

    暂无

最近的评论

  • 变乱码了 具体的rule是这个 verb: CCing C
  • rules: adjective: fu' ful sser ss ffer ff ller ll ler le pler ple CCer C ier y wer w CVCer CVCe VRCer VRCe eer ee er ssest ss ffest ff llest ll plest ple CCest C iest y west w CVCest CVCe VRCest VRCe eest ee est adverb: bly ble ily y ly verb: >1 ches ch ches che shes sh sses ss ffest ff <5 llest ll ssest ss CCest C cest ce gest ge Vthest Vthe VVlest VVl rlest rl wlest wl lest le enest en onest on erest er sest se iatest iate itest it vest ve west w xest x zest ze uVCest uVCe CVCest CVCe Crest Cer est eeth ee ffeth ff <5 lleth ll sseth ss CCeth C ceth ce geth ge Vtheth Vthe VVleth VVl rleth rl wleth wl leth le eneth en oneth on ereth er seth se iateth iate iteth it veth ve weth w xeth x zeth ze uVCeth uVCe CVCeth CVCe Creth Cer eth >1 ies y xes x ss ss! s went go tred tre ead ead >1 ied y yed y Ved Ve ffed ff <4 lled ll ssed ss CCed C ced ce ged ge Vthed Vthe VVled VVl rled rl wled wl led le ened en oned on + ored ore ored or + ered ere ered er sed se iated iate ited it ved ve wed w xed x zed ze uVCed uVCe CVCed CVCe ed aid ay fell fall gone go iven ve wn w ffing ff <4 lling ll ssing ss CCing C cing ce nging ng ging ge Vthing the rling rl wling wl VVling VVl Vling Vl ling le ening en oning on + oring ore oring or + ering ere ering er sing se iating iate iting it uing ue ving ve wing w xing x >1 ying y ying ie zing ze uVCing uVCe CVCing CVCe >1 -ing >1 ing ffin' ff <4 llin' ll ssin' ss CCin' C cin' ce ngin' ng gin' ge Vthin' the rlin' rl wlin' wl VVlin' VVl lin' le enin' en onin' on + orin' ore orin' or + erin' ere erin' er sin' se iatin' iate itin' it uin' ue vin' ve win' w xin' x >1 yin' y yin' ie zin' ze uVCin' uVCe CVCin' CVCe >1 in' 'st n't noun: men man ches che shes sh >1 ies y sses ss xes x 's s' s s ' noun-possessive: 's s' s '
  • // LemmatizerRule package main import ( "bufio" "fmt" "os" "regexp" "strconv" "strings" ) const ( RULE_DIRECT_LEFT = 0 RULE_DIRECT_RIGHT = 1 ) func main() { fmt.Println("Start") rules, err := LoadLemmatizerRules("englishrules.txt") if err != nil { panic("Bad Error!") } s := "cutting" for k, v := range rules { fmt.Println(k) for _, vv := range v { l, _ := vv.Apply(s, nil) //fmt.Println(vv.source) fmt.Println(l) } } } /** A Lemmatizer Rule which uses regular expressions. * * <p> * A lemmarizer rule specifies a string substitution pattern * used as part of the process of reducing an elaborated * morphological form to its base form (lemma). * </p> */ type LemmatizerRule struct { /** Original rule text. */ ruleText string /** Source pattern string to match. */ source string /** Compiled source pattern matcher. */ compiledSource *regexp.Regexp /** Replacement string. */ replacement string /** Compiled VCR string matcher. */ vcrMatcher *regexp.Regexp //=compile( "([VCRA]+)" ) /** Match direction (LEFT or RIGHT). */ direction int /** Minimum match length. */ matchLength int /** Must match dictionary entry. */ mustMatchDictionaryEntry bool } func (t *LemmatizerRule) Apply(spelling string, lexicon map[string]string) (string, error) { if t.compiledSource.MatchString(spelling) { lemma := t.compiledSource.ReplaceAllString(spelling, t.replacement) if t.mustMatchDictionaryEntry && lexicon != nil { _, ok := lexicon[lemma] if !ok { return "", nil } } return lemma, nil } return "", nil } func SplitByWhiteSpace(line string) []string { list := []string{} tok := "" for _, ch := range line { if ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t' { if len(tok) > 0 { list = append(list, tok) tok = "" } } else { tok += string(ch) } } if len(tok) > 0 { list = append(list, tok) } if len(list) == 0 { return nil } return list } func MakeDefaultLemmatizerRule(ruleText string) (*LemmatizerRule, error) { var rule LemmatizerRule rule.ruleText = ruleText rule.direction = RULE_DIRECT_RIGHT rule.source = "" rule.replacement = "" rule.mustMatchDictionaryEntry = false rule.vcrMatcher, _ = regexp.Compile("([VCRA]+)") tokens := SplitByWhiteSpace(ruleText) if tokens == nil { return nil, nil } idx := 0 ch := tokens[0][0] if ch == '<' || ch == '>' { if ch == '<' { rule.direction = RULE_DIRECT_LEFT } d, err := strconv.Atoi(string(tokens[0][1])) if err != nil { return nil, nil } rule.matchLength = d idx = 1 } else if ch == '+' { rule.mustMatchDictionaryEntry = true idx = 1 } rule.source = tokens[idx] if idx+1 < len(tokens) { rule.replacement = tokens[idx+1] } replacementCount := 1 if rule.matchLength > 0 { if rule.direction == RULE_DIRECT_RIGHT { rule.source = fmt.Sprintf("(..)%s", rule.source) rule.replacement = fmt.Sprintf("$%d%s", replacementCount, rule.replacement) replacementCount++ } else { rule.matchLength-- rule.source = fmt.Sprintf("^(.{1,%d})%s", rule.matchLength, rule.source) rule.replacement = fmt.Sprintf("$%d%s", replacementCount, rule.replacement) replacementCount++ } } if strings.Index(rule.source, "CC") >= 0 { rule.source = strings.Replace(rule.source, "CC", "([^aeiouy])\\$1", -1) //???? rule.replacement = strings.Replace(rule.replacement, "C", fmt.Sprintf("$%d", replacementCount), 1) } else { phonolog := rule.vcrMatcher.FindString(rule.source) if len(phonolog) > 0 { phonolog_orgi := phonolog phonolog = strings.Replace(phonolog, "V", "[aeiouy]", -1) phonolog = strings.Replace(phonolog, "C", "[^aeiouy]", -1) phonolog = strings.Replace(phonolog, "R", "r", -1) phonolog = strings.Replace(phonolog, "A", ".*", -1) rule.source = strings.Replace(rule.source, phonolog_orgi, fmt.Sprintf("(%s)", phonolog), 1) rule.replacement = strings.Replace(rule.replacement, phonolog_orgi, fmt.Sprintf("$%d", replacementCount), 1) replacementCount++ } } rule.source = rule.source + "$" cc, err := regexp.Compile(rule.source) if err != nil { return nil, nil } rule.compiledSource = cc return &rule, nil } func LoadLemmatizerRules(path string) (map[string][]*LemmatizerRule, error) { file, err := os.Open(path) if err != nil { panic("Bad Error!") } defer file.Close() reader := bufio.NewReader(file) result := make(map[string][]*LemmatizerRule) PosTag := "" cc := 0 for { line, err := reader.ReadString('\n') if err != nil { break } line = strings.Trim(line, " \t\r\n") if len(line) == 0 { continue } if strings.Contains(line, ":") { PosTag = strings.Trim(line, ":") PosTag = strings.ToLower(PosTag) continue } rule, err := MakeDefaultLemmatizerRule(line) if err != nil { continue } if len(PosTag) == 0 { continue } _, ok := result[PosTag] if !ok { result[PosTag] = []*LemmatizerRule{} } result[PosTag] = append(result[PosTag], rule) cc++ } if cc == 0 { return nil, nil } return result, nil }
  • #3 @lovegolang 编译能过,但是没效果了。我要把"cutting" 变成"cut",匹配"tt"。port别人的引擎,这条不过,就要大改了
  • #1 @lovegolang 试过了,不行啊。