给word docx文件批量加拼音,秒出结果。
相比VBA版本的改进:
1.快;(之前一个比较大的文档,VBA程序跑了将近整整3天,现在用Go写的程序两三秒就处理完了;)
2.中文标点等非汉字字符不需要手动添加到排除集合中了;(VBA程序中不知道怎么区分汉字和符号,只能手动编写符号集合,遇到有漏掉的符号就会出错;)
3.只需一个可执行程序,不需要安装其他依赖;(VBA程序需要安装python库,或者其他把汉字转成拼音的程序)
另附一个修改docx拼音的程序。
运行程序网盘下载链接(链接: https://pan.baidu.com/s/1aa8wiqd1ZNBmMIUrP50-6g 提取码: d2th)
添加拼音代码:
package main
import (
"archive/zip"
"bufio"
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path"
_ "regexp"
"strconv"
"strings"
"time"
"unicode"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
// "github.com/aurelien-rainone/assertgo"
"github.com/etree"
"github.com/mozillazg/go-pinyin"
)
// g_configPinyinFont is the font name written into the "Font:" part of every
// generated pinyin field; the "拼音字体" config key overrides it.
var g_configPinyinFont string = "微软雅黑"
// g_configPinyinFontSize is the starting pinyin size (the field's hps value).
// addPinyin replaces it with half of the run's w:sz unless a fixed size was
// configured through the "拼音字号" config key.
var g_configPinyinFontSize int = 10 // original note: the value actually used is the literal value * 2
// g_configPinyinUseFontSize is set by parseConfigLine when the config file
// names an explicit pinyin size.
var g_configPinyinUseFontSize bool = false // whether one uniform pinyin size is used for the whole document
// g_configPinyinOffset is only referenced from commented-out code in this
// program; addPinyin derives its own offset per run.
var g_configPinyinOffset int = g_configPinyinFontSize*2 - 1 // original note: defaults to g_configPinyinFontSize
// parseConfigLine applies one "key=value" line of the pinyin config file to
// the package-level settings.
//
// Recognised keys:
//   - "拼音字体": font name; an empty value keeps the current font.
//   - "拼音字号": integer size >= 1; an empty value keeps the default
//     behaviour. A valid value also sets g_configPinyinUseFontSize.
//
// Blank lines are ignored and whitespace around the key and the value is
// trimmed. A line that does not contain exactly one "=", an unknown key, or an
// invalid size yields an error.
func parseConfigLine(line string) error {
	trimmed := strings.TrimSpace(line)
	if trimmed == "" {
		// A blank line carries no setting; it is not a malformed one.
		return nil
	}

	vals := strings.Split(trimmed, "=")
	if len(vals) != 2 {
		return errors.New("无效配置:" + line)
	}
	key := strings.TrimSpace(vals[0])
	value := strings.TrimSpace(vals[1])

	switch key {
	case "拼音字体":
		if value != "" {
			g_configPinyinFont = value
		}
	case "拼音字号":
		if value == "" {
			// Left empty on purpose: keep the default sizing.
			break
		}
		v, err := strconv.Atoi(value)
		if err != nil {
			return fmt.Errorf("字号无效:%s: %s\r\n", line, err)
		}
		if v < 1 {
			// There is no underlying error to report here, only the bad value.
			return fmt.Errorf("字号无效:%s\r\n", line)
		}
		g_configPinyinUseFontSize = true
		g_configPinyinFontSize = v
	default:
		return errors.New("无效配置:" + line)
	}
	return nil
}
// parseConfig loads "拼音配置.txt" from the working directory.
//
// When the file cannot be stat'ed (normally because it does not exist yet) a
// template containing an explanatory comment and the current font is created
// and the built-in defaults stay in effect. Otherwise every line that is not
// blank and does not start with "#" is handed to parseConfigLine.
//
// It returns an error when the path is a directory, when the file cannot be
// created, opened or read, or when a line is rejected by parseConfigLine.
func parseConfig() error {
	fileName := "拼音配置.txt"
	fileinfo, err := os.Stat(fileName)
	if err != nil {
		// No usable config file: write a fresh template.
		newFile, err := os.Create(fileName)
		if err != nil {
			return err
		}
		_, err = newFile.WriteString(
			fmt.Sprintf(
				"# 说明:只支持docx格式的word文档;拼音字号默认动态变化,指定字号后全篇拼音使用指定字号;\r\n"+
					"拼音字体=%s\r\n"+
					"拼音字号=\r\n", // the size is left empty by default
				g_configPinyinFont,
			),
		)
		// Report a failed write or close instead of silently leaving a
		// truncated template behind.
		if closeErr := newFile.Close(); err == nil {
			err = closeErr
		}
		return err
	}
	if fileinfo.IsDir() {
		return errors.New("无法创建配置文件:" + fileName)
	}

	f, err := os.Open(fileName)
	if err != nil {
		return err
	}
	defer f.Close()

	br := bufio.NewReader(f)
	firstLine := true
	for {
		// ReadString returns whole lines regardless of their length; at end
		// of file it returns the remaining text together with io.EOF.
		raw, readErr := br.ReadString('\n')
		if readErr != nil && readErr != io.EOF {
			return readErr
		}
		if firstLine {
			// Some editors prepend a UTF-8 byte order mark to the file.
			raw = strings.TrimPrefix(raw, "\uFEFF")
			firstLine = false
		}
		strLine := strings.TrimSpace(raw)
		if strLine != "" && !strings.HasPrefix(strLine, "#") {
			if err := parseConfigLine(strLine); err != nil {
				return err
			}
		}
		if readErr == io.EOF {
			return nil
		}
	}
}
// createWrPrNode returns a detached copy of the run properties element
// (w:rPr) for use in a newly generated run.
//
// When the copy contains a w:rFonts child that carries a w:hint attribute,
// the hint is reset to "default". Properties without w:rFonts are copied
// unchanged instead of aborting the whole document, and a nil wrPrNode (a run
// that has no w:rPr at all) yields an empty w:rPr element.
func createWrPrNode(wrPrNode *etree.Element) *etree.Element {
	if wrPrNode == nil {
		return etree.NewElement("w:rPr")
	}
	newwrPr := wrPrNode.Copy()
	if wrFonts := newwrPr.FindElement("w:rFonts"); wrFonts != nil {
		if whint := wrFonts.SelectAttr("w:hint"); whint != nil {
			whint.Value = "default"
		}
	}
	return newwrPr
}
// createWrBegin builds the run that opens a field: a w:r holding a copy of the
// given run properties followed by <w:fldChar w:fldCharType="begin"/>.
func createWrBegin(wrPrNode *etree.Element) *etree.Element {
	run := etree.NewElement("w:r")
	run.AddChild(createWrPrNode(wrPrNode))

	marker := etree.NewElement("w:fldChar")
	marker.CreateAttr("w:fldCharType", "begin")
	run.AddChild(marker)

	return run
}
// createWrinstrText builds the run that carries a field instruction: a w:r
// holding a copy of the given run properties followed by a w:instrText element
// (xml:space="preserve") whose content is text.
//
// It panics when wrPrNode is nil.
func createWrinstrText(text string, wrPrNode *etree.Element) *etree.Element {
	if wrPrNode == nil {
		panic("nil == wrPrNode")
	}

	run := etree.NewElement("w:r")
	run.AddChild(createWrPrNode(wrPrNode))

	instr := etree.NewElement("w:instrText")
	instr.CreateAttr("xml:space", "preserve")
	instr.SetText(text)
	run.AddChild(instr)

	return run
}
// createWrEnd builds the run that closes a field: a w:r holding a copy of the
// given run properties followed by <w:fldChar w:fldCharType="end"/>.
func createWrEnd(wrPrNode *etree.Element) *etree.Element {
	run := etree.NewElement("w:r")
	run.AddChild(createWrPrNode(wrPrNode))

	marker := etree.NewElement("w:fldChar")
	marker.CreateAttr("w:fldCharType", "end")
	run.AddChild(marker)

	return run
}
// createWrwt builds a plain text run: a w:r holding a copy of the given run
// properties followed by a w:t element whose content is w.
//
// The caller emits one such run per non-annotated character, so w is often a
// single space. The w:t element is therefore marked xml:space="preserve" (as
// createWrinstrText already does for w:instrText); without it a consumer may
// strip leading/trailing whitespace and lose whitespace-only runs.
func createWrwt(w string, wrPrNode *etree.Element) *etree.Element {
	wr := etree.NewElement("w:r")
	wr.AddChild(createWrPrNode(wrPrNode))
	wt := etree.NewElement("w:t")
	wt.CreateAttr("xml:space", "preserve")
	wt.SetText(w)
	wr.AddChild(wt)
	return wr
}
// validUTF8 checks the byte structure of buf as (legacy, up to 6-byte) UTF-8
// without modifying buf.
//
// Return value:
//   - -1 when a lead byte announces fewer than 2 or more than 6 bytes,
//   - -2 when a byte inside a multi-byte sequence is not of the form 10xxxxxx,
//   - otherwise the number of continuation bytes still missing at the end of
//     buf (0 when buf ends on a character boundary).
func validUTF8(buf []byte) int {
	pending := 0 // continuation bytes still expected for the current character
	for _, b := range buf {
		if pending > 0 {
			if b&0xc0 != 0x80 { // continuation bytes must start with bits 10
				return -2
			}
			pending--
			continue
		}
		if b&0x80 == 0 { // single-byte (ASCII) character
			continue
		}
		// Count the leading 1 bits on a copy so the caller's data stays intact.
		ones := 0
		for lead := b; lead&0x80 != 0; lead <<= 1 {
			ones++
		}
		if ones < 2 || ones > 6 {
			return -1
		}
		pending = ones - 1 // the lead byte itself is already consumed
	}
	return pending
}
// addPinyin takes the raw bytes of word/document.xml and returns the document
// with a pinyin annotation field added to every Han character found in the
// w:t of runs that are direct children of w:body/w:p paragraphs.
//
// If the bytes do not parse as XML they are transcoded from GBK and then from
// HZ-GB2312 and parsed again. Each processed run is replaced by one new run
// (or one begin/instruction/end run triple) per character. The pinyin size is
// half of the run's w:sz unless a fixed size was configured; the raise offset
// always follows the run's w:sz when present.
//
// It returns an error when the XML cannot be parsed in any encoding, when the
// root element is not w:document, or when serialisation fails.
func addPinyin(buf []byte) (string, error) {
	pinyinArg := pinyin.NewArgs()
	pinyinArg.Style = pinyin.Tone // pinyin with tone marks

	doc := etree.NewDocument()
	err := doc.ReadFromBytes(buf)
	if err != nil {
		fmt.Println(err)
		transformers := []transform.Transformer{
			simplifiedchinese.GBK.NewDecoder(),
			simplifiedchinese.HZGB2312.NewDecoder(),
		}
		fmt.Println("尝试转码")
		for _, t := range transformers {
			var d []byte
			d, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(buf), t))
			if err != nil {
				continue
			}
			// Validate a copy: the decoded bytes are parsed right afterwards
			// and must not be touched by the check.
			left := validUTF8(append([]byte(nil), d...))
			if left < 0 {
				err = errors.New("非法的utf8编码")
				fmt.Println(err)
				continue
			} else if left != 0 {
				// The data ends inside a multi-byte character: drop its
				// continuation bytes and the lead byte in front of them.
				end := len(d)
				for end > 0 && d[end-1]&0xc0 == 0x80 {
					end--
				}
				if end > 0 {
					end--
				}
				d = d[:end]
			}
			// Parse into a fresh document in case the failed attempt left
			// partial nodes behind.
			doc = etree.NewDocument()
			err = doc.ReadFromBytes(d)
			if err == nil {
				fmt.Println("转码成功")
				break
			}
		}
		if err != nil {
			fmt.Println("转码失败")
			return "", err
		}
	}

	wdocument := doc.SelectElement("w:document")
	if wdocument == nil {
		return "", errors.New("word/document.xml 中没有 w:document 根节点")
	}

	// w:p is a paragraph; the document is processed paragraph by paragraph.
	for _, wp := range wdocument.FindElements("w:body/w:p") {
		for _, wr := range wp.FindElements("w:r") {
			wt := wr.FindElement("w:t")
			if wt == nil {
				continue // runs without text are left untouched
			}

			wrPr := wr.FindElement("w:rPr")
			if wrPr == nil {
				// The run has no properties. The run builders copy a w:rPr
				// and look up w:rFonts in it, so hand them a detached
				// placeholder.
				wrPr = etree.NewElement("w:rPr")
				wrPr.CreateElement("w:rFonts")
			}

			pinyinFontSize := g_configPinyinFontSize
			pinyinOffset := pinyinFontSize
			if wsz := wrPr.FindElement("w:sz"); wsz != nil {
				if i, err := strconv.Atoi(wsz.SelectAttrValue("w:val", "")); err == nil {
					if !g_configPinyinUseFontSize {
						pinyinFontSize = i / 2
					}
					pinyinOffset = i / 2
				}
			}

			// Field code shape:
			//   EQ \* jc0 \* "Font:微软雅黑" \* hps20 \o \ad(\s \up 19(pinyin),han)
			// Built by concatenation so that a "%" in the configured font
			// name cannot be mistaken for a formatting verb.
			fieldPrefix := ` EQ \* jc0 \* "Font:` + g_configPinyinFont +
				`" \* hps` + strconv.Itoa(pinyinFontSize) +
				` \o \ad(\s \up ` + strconv.Itoa(pinyinOffset-1) + `(`

			for _, w := range wt.Text() {
				// Look the pinyin up per character so a Han rune that has no
				// dictionary entry cannot shift the readings of the
				// characters after it or overrun the result slice.
				var py [][]string
				if unicode.Is(unicode.Han, w) {
					py = pinyin.Pinyin(string(w), pinyinArg)
				}
				if len(py) > 0 && len(py[0]) > 0 {
					newText := fieldPrefix + py[0][0] + `),` + string(w) + `)`
					wp.InsertChild(wr, createWrBegin(wrPr))
					wp.InsertChild(wr, createWrinstrText(newText, wrPr))
					wp.InsertChild(wr, createWrEnd(wrPr))
				} else {
					wp.InsertChild(wr, createWrwt(string(w), wrPr))
				}
			}
			// The original run is fully replaced by the runs inserted above.
			wp.RemoveChild(wr)
		}
	}

	newXml, err := doc.WriteToString()
	if err != nil {
		fmt.Println(err)
		return "", err
	}
	return newXml, nil
}
func procOneDocxFile(fromPath string, toPath string) error {
zipReader, err := zip.OpenReader(fromPath)
if err != nil {
fmt.Print(err)
return err
}
defer zipReader.Close()
newZipFile, err := os.Create(toPath)
if err != nil {
fmt.Println(err)
return err
}
defer newZipFile.Close()
zipWriter := zip.NewWriter(newZipFile)
defer zipWriter.Close()
var f *zip.File
for _, file := range zipReader.File {
rc, err := file.Open()
if nil != err {
return err
}
buf := make([]byte, file.UncompressedSize)
// zipfile文件一次可能不能读完,循环读完为止
readLen := 0
for file.UncompressedSize != uint32(readLen) {
n, err := rc.Read(buf[readLen:])
if nil != err && (0 != strings.Compare("EOF", err.Error())) {
fmt.Println(err)
return err
}
if 0 == n {
return errors.New("读取zip出错")
}
readLen += n
}
var newBuf []byte
if "word/document.xml" == file.Name {
f = file
if file.UncompressedSize != uint32(readLen) {
panic("读取错误")
}
newXmlStr, err := addPinyin(buf)
if nil != err {
return err
}
newBuf = []byte(newXmlStr)
} else {
newBuf = buf
}
newFile, err := zipWriter.Create(file.Name)
if err != nil {
return err
}
_, err = newFile.Write(newBuf)
if err != nil {
return err
}
}
if nil == f {
err = errors.New(fromPath + ": 没有 word/document.xml")
return err
}
return nil
}
// initDir makes sure every entry of paths is a directory.
//
// A path that cannot be stat'ed is created with os.Mkdir (a creation failure
// is printed and returned); a path that exists but is not a directory yields
// an error. Processing stops at the first failure.
func initDir(paths []string) error {
	for i := range paths {
		dir := paths[i]
		info, statErr := os.Stat(dir)
		switch {
		case statErr != nil:
			if mkErr := os.Mkdir(dir, os.ModePerm); mkErr != nil {
				fmt.Println(mkErr)
				return mkErr
			}
		case !info.IsDir():
			return errors.New("无法创建目录:" + dir)
		}
	}
	return nil
}
// main annotates every .docx file found in the to-do directory and writes the
// results, with a timestamp appended to the name, into the result directory.
//
// Directories and Word lock files ("~$" prefix) are skipped. The run stops at
// the first error. On exit (including after a panic) the error, the elapsed
// time and a prompt are printed, and the program waits for input before it
// terminates.
func main() {
	var err error
	begin := time.Now()
	defer func() {
		if r := recover(); r != nil {
			fmt.Printf("panic recover! : %v\r\n", r)
		}
		if err != nil {
			fmt.Printf("error : %v\r\n", err)
		}
		fmt.Println("耗时:", time.Since(begin))
		fmt.Println("按任意键结束")
		// Keep the console window open until the user reacts.
		var data int
		fmt.Scanf("%d", &data)
	}()

	const (
		todoDir = "./1-加拼音的docx-待处理"
		doneDir = "./2-加拼音的docx-结果"
	)
	if err = initDir([]string{todoDir, doneDir}); err != nil {
		return
	}
	if err = parseConfig(); err != nil {
		return
	}

	entries, readErr := ioutil.ReadDir(todoDir)
	if err = readErr; err != nil {
		return
	}
	for _, entry := range entries {
		name := entry.Name()
		ext := path.Ext(name)
		if entry.IsDir() || strings.HasPrefix(path.Base(name), "~$") || !strings.EqualFold(".docx", ext) {
			continue
		}
		fmt.Println("正在处理文件:" + name)
		src := todoDir + "/" + name
		dst := doneDir + "/" + strings.TrimSuffix(path.Base(name), ext) + time.Now().Format("_20060102_150405.docx")
		if err = procOneDocxFile(src, dst); err != nil {
			return
		}
	}
}
修改拼音代码:
package main
import (
"archive/zip"
"bufio"
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"strings"
"time"
_ "unicode"
_ "os"
"regexp"
// "github.com/aurelien-rainone/assertgo"
"github.com/etree"
_ "github.com/mozillazg/go-pinyin"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
)
// wordGroupPinyin_t is one word group read from the config file together with
// the pinyin the user specified for its characters.
type wordGroupPinyin_t struct {
hansAll string // all entries of hans joined into one string; used for prefix matching
hans []string // the characters of the group, one entry per character
pinyins []string // pinyins[i] is the reading for hans[i]; "" leaves that character's annotation unchanged
}
// specifiedDic maps the first character of a word group to every configured
// word group that starts with that character. It is filled by parseConfigLine
// and read by procSpecified. Example entry:
var specifiedDic map[string][]wordGroupPinyin_t = map[string][]wordGroupPinyin_t{
// "马": []wordGroupPinyin_t{
// wordGroupPinyin_t{
// hansAll: "马拉松",
// hans: []string{"马", "拉", "松"},
// pinyins: []string{"ma", "lalala", "song"},
// },
// },
}
// parseConfigLine parses one word-group line such as "归还[huán]" and adds the
// group to specifiedDic under its first character.
//
// Every character outside brackets becomes one entry of the group; a
// "[pinyin]" block assigns that pinyin to the character directly in front of
// it. Characters without a block keep an empty pinyin, which means "do not
// change". Surrounding whitespace is ignored and a blank line is a no-op.
//
// It returns an error (also printed) when a bracket block is not closed or
// when a block appears before any character.
func parseConfigLine(line string) error {
	line = strings.TrimSpace(line)
	lineHans := strings.Split(line, "") // one element per UTF-8 character
	if len(lineHans) == 0 {
		return nil
	}

	var hans, pinyins []string
	var current strings.Builder // pinyin collected since the last "["
	inPinyin := false
	for _, w := range lineHans {
		switch {
		case !inPinyin && w == "[":
			if len(hans) == 0 {
				// There is no character this pinyin could belong to.
				err := errors.New(line + ": 拼音前面没有汉字")
				fmt.Println(err)
				return err
			}
			inPinyin = true
		case !inPinyin:
			hans = append(hans, w)
			pinyins = append(pinyins, "")
		case w == "]":
			pinyins[len(pinyins)-1] = current.String()
			current.Reset()
			inPinyin = false
		default:
			current.WriteString(w)
		}
	}
	if inPinyin {
		err := errors.New(line + ": 拼音没有结束")
		fmt.Println(err)
		return err
	}

	firstHan := hans[0]
	specifiedDic[firstHan] = append(specifiedDic[firstHan], wordGroupPinyin_t{
		hansAll: strings.Join(hans, ""),
		hans:    hans,
		pinyins: pinyins,
	})
	return nil
}
// parseConfig loads "需要修改的拼音.txt" from the working directory.
//
// When the file cannot be stat'ed (normally because it does not exist yet) a
// template containing only explanatory comments is created. Otherwise every
// line that is not blank and does not start with "#" is handed to
// parseConfigLine.
//
// It returns an error when the path is a directory, when the file cannot be
// created, opened or read, or when a line is rejected by parseConfigLine.
func parseConfig() error {
	fileName := "需要修改的拼音.txt"
	fileinfo, err := os.Stat(fileName)
	if err != nil {
		// No usable config file: write a fresh template.
		newFile, err := os.Create(fileName)
		if err != nil {
			return err
		}
		_, err = newFile.WriteString("# 说明:只支持docx格式的word文档\r\n# 请指定词组读音,每行一个词组,没有指定拼音的字不修改注音,例:归还[huán]\r\n")
		// Report a failed write or close instead of silently leaving a
		// truncated template behind.
		if closeErr := newFile.Close(); err == nil {
			err = closeErr
		}
		return err
	}
	if fileinfo.IsDir() {
		return errors.New("无法创建配置文件:" + fileName)
	}

	f, err := os.Open(fileName)
	if err != nil {
		return err
	}
	defer f.Close()

	br := bufio.NewReader(f)
	firstLine := true
	for {
		// ReadString returns whole lines regardless of their length; at end
		// of file it returns the remaining text together with io.EOF.
		raw, readErr := br.ReadString('\n')
		if readErr != nil && readErr != io.EOF {
			return readErr
		}
		if firstLine {
			// Some editors prepend a UTF-8 byte order mark to the file.
			raw = strings.TrimPrefix(raw, "\uFEFF")
			firstLine = false
		}
		strLine := strings.TrimSpace(raw)
		if strLine != "" && !strings.HasPrefix(strLine, "#") {
			if err := parseConfigLine(strLine); err != nil {
				return err
			}
		}
		if readErr == io.EOF {
			return nil
		}
	}
}
// procSpecified rewrites the pinyin of one paragraph according to
// specifiedDic.
//
// wpHans and wpWinstrTexts are parallel: wpHans[i] is the text of the i-th
// item of the paragraph and wpWinstrTexts[i] is the w:instrText element that
// annotates it, or nil for a plain text run. At every position the longest
// configured word group that is a prefix of the remaining text is applied:
// each of its characters with a non-empty pinyin gets the part of the field
// instruction matched by reg replaced with fmtValue filled with the new pinyin
// and the character.
//
// Plain text items inside a matched group are skipped. The function panics
// when the two slices differ in length, when a matched group runs past the end
// of the paragraph, or when an annotation does not have the expected form or
// character.
func procSpecified(wpHans []string, wpWinstrTexts []*etree.Element, reg *regexp.Regexp, fmtValue string) {
	if len(wpHans) != len(wpWinstrTexts) {
		panic("len(wpHans) != len(wpWinstrTexts)")
	}
	for i := 0; i < len(wpHans); i++ {
		candidates := specifiedDic[wpHans[i]]
		if candidates == nil {
			continue
		}

		// Pick the longest word group that matches at this position.
		sentence := strings.Join(wpHans[i:], "")
		num := 0
		var best wordGroupPinyin_t
		for _, a := range candidates {
			if strings.HasPrefix(sentence, a.hansAll) && num < len(a.hans) {
				num = len(a.hans)
				best = a
			}
		}
		if num == 0 {
			continue
		}

		for j := 0; j < len(best.hans); j++ {
			if len(wpWinstrTexts) <= i+j {
				panic("越界了")
			}
			wpWinstrText := wpWinstrTexts[i+j]
			if wpWinstrText == nil {
				// Not an annotated character (the caller stores nil for plain
				// text runs): there is no pinyin to rewrite.
				continue
			}
			text := wpWinstrText.Text()
			vals := reg.FindAllStringSubmatch(text, 1)
			if (len(vals) == 0) || (len(vals[0]) < 3) {
				panic("(0 == len(vals)) || (len(vals[0]) < 3)")
			}
			if vals[0][2] != best.hans[j] {
				panic("vals[0][2] != wordGroupPinyin.hans[j]")
			}
			pinyin := best.pinyins[j]
			if pinyin != "" {
				han := best.hans[j]
				newText := reg.ReplaceAllStringFunc(text, func(string) string {
					return fmt.Sprintf(fmtValue, pinyin, han)
				})
				wpWinstrText.SetText(newText)
			}
		}
		// Continue behind the group that was just handled.
		i += len(best.hans) - 1
	}
}
// validUTF8 checks the byte structure of buf as (legacy, up to 6-byte) UTF-8.
// buf is only read, never written.
//
// It returns -1 for a lead byte that announces fewer than 2 or more than 6
// bytes, -2 for a malformed continuation byte, and otherwise the number of
// continuation bytes that are still missing at the end of buf (0 when buf
// ends on a character boundary).
func validUTF8(buf []byte) int {
	nBytes := 0 // continuation bytes still expected
	for i := 0; i < len(buf); i++ {
		c := buf[i] // work on a copy; shifting buf[i] itself would corrupt the data
		if nBytes == 0 {
			if c&0x80 == 0 {
				continue // plain ASCII byte
			}
			// The number of leading 1 bits is the length of the character.
			for c&0x80 != 0 {
				c <<= 1
				nBytes++
			}
			if nBytes < 2 || nBytes > 6 {
				return -1
			}
			nBytes-- // the lead byte itself has been consumed
			continue
		}
		if c&0xc0 != 0x80 { // continuation bytes must look like 10xxxxxx
			return -2
		}
		nBytes--
	}
	return nBytes
}
// modifyPinyin takes the raw bytes of word/document.xml, replaces the pinyin
// of the word groups configured in specifiedDic and returns the new XML.
//
// If the bytes do not parse as XML they are transcoded from GBK and then from
// HZ-GB2312 and parsed again. For every w:body/w:p paragraph the direct w:r
// children are scanned: the text of w:instrText elements is accumulated until
// it ends in "(pinyin),han)", at which point the complete instruction is
// stored in the last fragment (earlier fragments are removed from their runs);
// w:t runs are recorded as plain text. The collected items are then passed to
// procSpecified.
//
// It returns an error when the XML cannot be parsed in any encoding, when the
// root element is not w:document, or when serialisation fails. It panics when
// a w:t run follows an instruction that was never completed.
func modifyPinyin(buf []byte) (string, error) {
	doc := etree.NewDocument()
	err := doc.ReadFromBytes(buf)
	if err != nil {
		fmt.Println(err)
		transformers := []transform.Transformer{
			simplifiedchinese.GBK.NewDecoder(),
			simplifiedchinese.HZGB2312.NewDecoder(),
		}
		fmt.Println("尝试转码")
		for _, t := range transformers {
			var d []byte
			d, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(buf), t))
			if err != nil {
				continue
			}
			// Validate a copy: the decoded bytes are parsed right afterwards
			// and must not be touched by the check.
			left := validUTF8(append([]byte(nil), d...))
			if left < 0 {
				err = errors.New("非法的utf8编码")
				fmt.Println(err)
				continue
			} else if left != 0 {
				// The data ends inside a multi-byte character: drop its
				// continuation bytes and the lead byte in front of them.
				end := len(d)
				for end > 0 && d[end-1]&0xc0 == 0x80 {
					end--
				}
				if end > 0 {
					end--
				}
				d = d[:end]
			}
			// Parse into a fresh document in case the failed attempt left
			// partial nodes behind.
			doc = etree.NewDocument()
			err = doc.ReadFromBytes(d)
			if err == nil {
				fmt.Println("转码成功")
				break
			}
		}
		if err != nil {
			fmt.Println("转码失败")
			return "", err
		}
	}

	wdocument := doc.SelectElement("w:document")
	if wdocument == nil {
		return "", errors.New("word/document.xml 中没有 w:document 根节点")
	}

	// Tail of a pinyin field instruction: "(pinyin),han)".
	xmlValue := `\(([^(]*?)\),([^(]*?)\)$`
	fmtValue := `(%s),%s)`
	regValue := regexp.MustCompile(xmlValue)

	// w:p is a paragraph; the document is processed paragraph by paragraph.
	for _, wp := range wdocument.FindElements("w:body/w:p") {
		wpHans := []string{}
		wpWinstrTexts := []*etree.Element{}
		text := ""
		for _, wr := range wp.FindElements("w:r") {
			if winstrText := wr.FindElement("w:instrText"); winstrText != nil {
				// One instruction is sometimes split over several w:r runs.
				text += winstrText.Text()
				vals := regValue.FindAllStringSubmatch(text, -1)
				if len(vals) == 0 {
					// Incomplete so far: drop this fragment, its text lives
					// on in the accumulator.
					winstrText.Parent().RemoveChild(winstrText)
					continue
				}
				if len(vals[0]) < 3 {
					panic("解析出错")
				}
				wpHans = append(wpHans, vals[0][2])
				winstrText.SetText(text)
				wpWinstrTexts = append(wpWinstrTexts, winstrText)
				text = ""
			} else if wt := wr.FindElement("w:t"); wt != nil {
				if text != "" {
					panic("text 不为空")
				}
				wpHans = append(wpHans, wt.Text())
				wpWinstrTexts = append(wpWinstrTexts, nil)
			}
		}
		procSpecified(wpHans, wpWinstrTexts, regValue, fmtValue)
	}
	return doc.WriteToString()
}
func procOneDocxFile(fromPath string, toPath string) error {
zipReader, err := zip.OpenReader(fromPath)
if err != nil {
fmt.Print(err)
return err
}
defer zipReader.Close()
newZipFile, err := os.Create(toPath)
if err != nil {
fmt.Println(err)
return err
}
defer newZipFile.Close()
zipWriter := zip.NewWriter(newZipFile)
defer zipWriter.Close()
var f *zip.File
for _, file := range zipReader.File {
rc, err := file.Open()
if nil != err {
return err
}
buf := make([]byte, file.UncompressedSize)
// zipfile文件一次可能不能读完,循环读完为止
readLen := 0
for file.UncompressedSize != uint32(readLen) {
n, err := rc.Read(buf[readLen:])
if nil != err && (0 != strings.Compare("EOF", err.Error())) {
fmt.Println(err)
return err
}
if 0 == n {
return errors.New("读取zip出错")
}
readLen += n
}
var newBuf []byte
if "word/document.xml" == file.Name {
f = file
if file.UncompressedSize != uint32(readLen) {
panic("读取错误")
}
newXmlStr, err := modifyPinyin(buf)
if nil != err {
return err
}
newBuf = []byte(newXmlStr)
} else {
newBuf = buf
}
newFile, err := zipWriter.Create(file.Name)
if err != nil {
return err
}
_, err = newFile.Write(newBuf)
if err != nil {
return err
}
}
if nil == f {
err = errors.New(fromPath + ": 没有 word/document.xml")
return err
}
return nil
}
// initDir ensures that each element of paths names a directory.
//
// Existing directories are accepted as they are. A path that exists but is not
// a directory produces an error. A path that cannot be stat'ed is created with
// os.Mkdir; if that fails the error is printed and returned.
func initDir(paths []string) error {
	for _, dir := range paths {
		info, err := os.Stat(dir)
		if err == nil {
			if info.IsDir() {
				continue
			}
			return errors.New("无法创建目录:" + dir)
		}
		if err = os.Mkdir(dir, os.ModePerm); err != nil {
			fmt.Println(err)
			return err
		}
	}
	return nil
}
// main applies the configured pinyin corrections to every .docx file found in
// the to-do directory and writes the results, with a timestamp appended to the
// name, into the result directory.
//
// Directories and Word lock files ("~$" prefix) are skipped. The run stops at
// the first error. On exit (including after a panic) the error, the elapsed
// time and a prompt are printed, and the program waits for input before it
// terminates.
func main() {
	var err error
	started := time.Now()
	defer func() {
		if recovered := recover(); recovered != nil {
			fmt.Printf("panic recover! : %v\r\n", recovered)
		}
		if err != nil {
			fmt.Printf("error : %v\r\n", err)
		}
		fmt.Println("耗时:", time.Since(started))
		fmt.Println("按任意键结束")
		// Keep the console window open until the user reacts.
		var data int
		fmt.Scanf("%d", &data)
	}()

	todoDir, doneDir := "./3-修改拼音的docx-待处理", "./4-修改拼音的docx-结果"
	if err = initDir([]string{todoDir, doneDir}); err != nil {
		return
	}
	if err = parseConfig(); err != nil {
		return
	}

	files, listErr := ioutil.ReadDir(todoDir)
	if err = listErr; err != nil {
		return
	}
	for _, f := range files {
		if f.IsDir() {
			continue
		}
		name := f.Name()
		isLockFile := strings.HasPrefix(path.Base(name), "~$")
		ext := path.Ext(name)
		if isLockFile || !strings.EqualFold(".docx", ext) {
			continue
		}
		fmt.Println("正在处理文件:" + name)
		stem := strings.TrimSuffix(path.Base(name), ext)
		target := doneDir + "/" + stem + time.Now().Format("_20060102_150405.docx")
		if err = procOneDocxFile(todoDir+"/"+name, target); err != nil {
			return
		}
	}
}
有疑问加站长微信联系(非本文作者)