正则表达式: ([^aeiouy])\1
这个正则表达式在 Java 和 C# 中都可以正常编译,但在 Go 中无法编译。
如何在 Go 中表达 \1(反向引用)这个语法?
谢谢
// LemmatizerRule
package main
import (
"bufio"
"fmt"
"os"
"regexp"
"strconv"
"strings"
)
// Match directions a LemmatizerRule can be given.
const (
	// RULE_DIRECT_LEFT selects the left match direction (value 0).
	RULE_DIRECT_LEFT = iota
	// RULE_DIRECT_RIGHT selects the right match direction (value 1).
	RULE_DIRECT_RIGHT
)
// main loads the rules from englishrules.txt and, for every part-of-speech
// tag, prints the tag followed by the result of applying each of its rules
// to a sample word.
func main() {
	fmt.Println("Start")

	ruleSets, loadErr := LoadLemmatizerRules("englishrules.txt")
	if loadErr != nil {
		panic("Bad Error!")
	}

	const word = "cutting"
	for posTag, tagRules := range ruleSets {
		fmt.Println(posTag)
		for i := range tagRules {
			// No lexicon is supplied, so dictionary checks are skipped.
			lemma, _ := tagRules[i].Apply(word, nil)
			fmt.Println(lemma)
		}
	}
}
// LemmatizerRule is a lemmatizer rule which uses regular expressions.
//
// A lemmatizer rule specifies a string substitution pattern used as part of
// the process of reducing an elaborated morphological form to its base form
// (lemma).
type LemmatizerRule struct {
// ruleText is the original rule text.
ruleText string
// source is the source pattern string to match, after expansion into
// regular expression syntax.
source string
// compiledSource is the compiled form of source; Apply matches spellings
// against it.
compiledSource *regexp.Regexp
// replacement is the replacement template Apply passes to
// ReplaceAllString.
replacement string
// vcrMatcher is the compiled matcher for runs of the pattern letters
// V, C, R and A; MakeDefaultLemmatizerRule compiles it from "([VCRA]+)".
vcrMatcher *regexp.Regexp
// direction is the match direction: RULE_DIRECT_LEFT or RULE_DIRECT_RIGHT.
direction int
// matchLength is the minimum match length, taken from the digit that
// follows a leading '<' or '>' in the rule text.
matchLength int
// mustMatchDictionaryEntry reports whether the lemma must be a dictionary
// entry: when true and Apply is given a lexicon, only lemmas that are keys
// of the lexicon are returned.
mustMatchDictionaryEntry bool
}
// Apply rewrites spelling with the rule and returns the resulting lemma.
//
// It returns the empty string when the rule's pattern does not match
// spelling, or when the rule requires a dictionary entry, lexicon is non-nil
// and the lemma is not a key of lexicon. A nil lexicon disables the
// dictionary check. A nil rule, or a rule without a compiled pattern, matches
// nothing and also yields the empty string instead of panicking.
//
// The returned error is always nil.
func (t *LemmatizerRule) Apply(spelling string, lexicon map[string]string) (string, error) {
	// Rule collections may contain entries for rule lines that could not be
	// compiled; treat those as rules that never match.
	if t == nil || t.compiledSource == nil {
		return "", nil
	}
	if !t.compiledSource.MatchString(spelling) {
		return "", nil
	}

	lemma := t.compiledSource.ReplaceAllString(spelling, t.replacement)
	if t.mustMatchDictionaryEntry && lexicon != nil {
		if _, known := lexicon[lemma]; !known {
			return "", nil
		}
	}
	return lemma, nil
}
// SplitByWhiteSpace breaks line into the tokens separated by runs of spaces,
// tabs, carriage returns and newlines. No other character is treated as a
// separator. It returns nil when line contains no token.
func SplitByWhiteSpace(line string) []string {
	var (
		tokens  []string
		pending []rune
	)
	// flush emits the token collected so far, if there is one.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		tokens = append(tokens, string(pending))
		pending = pending[:0]
	}

	for _, r := range line {
		switch r {
		case ' ', '\r', '\n', '\t':
			flush()
		default:
			pending = append(pending, r)
		}
	}
	flush()
	return tokens
}
// MakeDefaultLemmatizerRule parses one line of rule text into a compiled
// LemmatizerRule.
//
// The rule text is split on whitespace. An optional first token selects a
// mode: "<N" or ">N" (N a single digit) sets the match direction to left or
// right and the minimum match length to N, and a token starting with "+"
// marks the rule as requiring a dictionary entry. The next token is the
// source pattern and the optional token after it is the replacement.
//
// In the source pattern "CC" stands for a doubled consonant and the first "C"
// of the replacement stands for that consonant written once. Otherwise the
// first run of the letters V, C, R and A in the source is expanded to
// [aeiouy], [^aeiouy], r and .* respectively and captured, and the same run
// in the replacement refers to the captured text. The pattern is anchored at
// the end of the word.
//
// It returns a nil rule and a non-nil error when the rule text is empty or
// malformed, or when the expanded pattern does not compile.
func MakeDefaultLemmatizerRule(ruleText string) (*LemmatizerRule, error) {
	tokens := SplitByWhiteSpace(ruleText)
	if len(tokens) == 0 {
		return nil, fmt.Errorf("empty lemmatizer rule")
	}

	rule := LemmatizerRule{
		ruleText:   ruleText,
		direction:  RULE_DIRECT_RIGHT,
		vcrMatcher: regexp.MustCompile("([VCRA]+)"),
	}

	// idx is the position of the source pattern among the tokens.
	idx := 0
	switch head := tokens[0]; head[0] {
	case '<', '>':
		if len(head) < 2 {
			return nil, fmt.Errorf("lemmatizer rule %q: missing match length after %q", ruleText, head)
		}
		// Only the single character after the direction marker is read.
		length, err := strconv.Atoi(string(head[1]))
		if err != nil {
			return nil, fmt.Errorf("lemmatizer rule %q: bad match length: %w", ruleText, err)
		}
		if head[0] == '<' {
			rule.direction = RULE_DIRECT_LEFT
		}
		rule.matchLength = length
		idx = 1
	case '+':
		rule.mustMatchDictionaryEntry = true
		idx = 1
	}

	if idx >= len(tokens) {
		return nil, fmt.Errorf("lemmatizer rule %q: missing source pattern", ruleText)
	}
	rule.source = tokens[idx]
	if idx+1 < len(tokens) {
		rule.replacement = tokens[idx+1]
	}

	// group is the number of the next capture group to be introduced.
	// Group references are always written as ${n}: Go reads "$1ing" as a
	// reference to a group named "1ing", which would expand to nothing.
	group := 1
	if rule.matchLength > 0 {
		if rule.direction == RULE_DIRECT_RIGHT {
			// Right-direction rules keep a fixed two-character prefix;
			// matchLength itself does not appear in the pattern.
			rule.source = "(..)" + rule.source
		} else {
			rule.matchLength--
			rule.source = fmt.Sprintf("^(.{1,%d})%s", rule.matchLength, rule.source)
		}
		rule.replacement = fmt.Sprintf("${%d}%s", group, rule.replacement)
		group++
	}

	if strings.Contains(rule.source, "CC") {
		// Go's regexp package (RE2) has no backreferences, so a doubled
		// consonant cannot be written as ([^aeiouy])\1. It is spelled out as
		// an alternation of doubled letters instead.
		pattern, template := doubledConsonantAlternation(group)
		rule.source = strings.Replace(rule.source, "CC", pattern, -1)
		rule.replacement = strings.Replace(rule.replacement, "C", template, 1)
	} else if phonolog := rule.vcrMatcher.FindString(rule.source); len(phonolog) > 0 {
		expanded := strings.Replace(phonolog, "V", "[aeiouy]", -1)
		expanded = strings.Replace(expanded, "C", "[^aeiouy]", -1)
		expanded = strings.Replace(expanded, "R", "r", -1)
		expanded = strings.Replace(expanded, "A", ".*", -1)
		rule.source = strings.Replace(rule.source, phonolog, "("+expanded+")", 1)
		rule.replacement = strings.Replace(rule.replacement, phonolog, fmt.Sprintf("${%d}", group), 1)
	}

	rule.source += "$"
	compiled, err := regexp.Compile(rule.source)
	if err != nil {
		return nil, fmt.Errorf("lemmatizer rule %q: %w", ruleText, err)
	}
	rule.compiledSource = compiled
	return &rule, nil
}

// doubledConsonantAlternation returns a pattern that matches any doubled
// lower-case ASCII consonant (every letter except a, e, i, o, u and y) and a
// replacement template that expands to that consonant written once.
//
// The pattern has one capture group per consonant, numbered from firstGroup.
// Only the alternative that matched captures anything, and references to
// unmatched groups expand to nothing, so concatenating a reference to every
// group yields exactly the matched consonant.
func doubledConsonantAlternation(firstGroup int) (pattern, template string) {
	const consonants = "bcdfghjklmnpqrstvwxz"

	var pat, tmpl strings.Builder
	pat.WriteString("(?:")
	for i := 0; i < len(consonants); i++ {
		if i > 0 {
			pat.WriteByte('|')
		}
		c := consonants[i]
		fmt.Fprintf(&pat, "(%c)%c", c, c)
		fmt.Fprintf(&tmpl, "${%d}", firstGroup+i)
	}
	pat.WriteByte(')')
	return pat.String(), tmpl.String()
}
// LoadLemmatizerRules reads lemmatizer rules from the text file at path and
// groups them by part-of-speech tag.
//
// Blank lines are skipped. A line containing ":" starts a new section: the
// line with leading and trailing colons removed, lower-cased, becomes the tag
// for the rules that follow. Every other line is parsed with
// MakeDefaultLemmatizerRule; lines that do not yield a rule, and rules that
// appear before the first section line, are ignored.
//
// It returns a nil map and a nil error when the file yields no rules, and a
// nil map and a non-nil error when the file cannot be opened or read.
func LoadLemmatizerRules(path string) (map[string][]*LemmatizerRule, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("loading lemmatizer rules: %w", err)
	}
	defer file.Close()

	result := make(map[string][]*LemmatizerRule)
	posTag := ""
	loaded := 0

	// A Scanner also delivers a final line that has no trailing newline.
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.Trim(scanner.Text(), " \t\r\n")
		if len(line) == 0 {
			continue
		}
		if strings.Contains(line, ":") {
			posTag = strings.ToLower(strings.Trim(line, ":"))
			continue
		}

		rule, err := MakeDefaultLemmatizerRule(line)
		if err != nil || rule == nil {
			// Unparseable rule line: skip it so that the result never
			// contains nil rules.
			continue
		}
		if len(posTag) == 0 {
			continue
		}
		result[posTag] = append(result[posTag], rule)
		loaded++
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading lemmatizer rules from %q: %w", path, err)
	}

	if loaded == 0 {
		return nil, nil
	}
	return result, nil
}
#6
更多评论
改成: `([^aeiouy])$1` 试试
go 用的正则跟其他的有些不太一样,参见文档:http://docs.studygolang.com/pkg/regexp/
#1