using AIProofread.core;
using Microsoft.Office.Interop.Word;
using Newtonsoft.Json;
using NPOI;
using NPOI.POIFS.FileSystem;
using NPOI.XWPF.UserModel;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.IO.Compression;
using System.Runtime.InteropServices;
using System.Security.Cryptography;

namespace AIProofread
{
    /// <summary>
    /// Static helpers for extracting paragraph text from Word documents
    /// (through NPOI or the Word object model) plus a few small utilities.
    /// </summary>
    public class Tools
    {
        // Paragraph separators, longest first so "\r\n" is never split as "\r" + "\n".
        // Not referenced by the methods in this class at the moment.
        private static readonly string[] paragSplitor = new string[] { "\r\n", "\r\f", "\r", "\n", "\f" };

        /// <summary>
        /// Reads the document text with NPOI from a temporary copy of the document.
        /// </summary>
        /// <param name="doc">The Word document to read. It is saved before being copied.</param>
        /// <returns>
        /// A map with two entries: "list" (the collected <c>DocumentText</c> items)
        /// and "text" (the full text of <c>doc.Range()</c>).
        /// </returns>
        public static Dictionary<string, object> GetAllText(Microsoft.Office.Interop.Word.Document doc)
        {
            // NOTE: a shortcut that used GetAllTextByVSTO for .wps / small documents
            // was disabled in the original code and is intentionally not restored here.

            // Create a temporary file so the data can be read independently of Word.
            string docPath = Tools.GetReadDocumentFilePath(doc);
            try
            {
                Debug.WriteLine($"GetAllText Start ==> {DateTime.Now.ToLongTimeString()}");

                // Full text of the current document.
                string allText = doc.Range().Text;
                List<DocumentText> list = new List<DocumentText>();

                try
                {
                    ReadTextByNPOI(docPath, list);
                }
                catch (POIXMLException ex)
                {
                    if (ex.StackTrace?.Contains("NPOI.XWPF.UserModel.XWPFNumbering") == true)
                    {
                        // The numbering part is broken: strip it and read again.
                        RemoveNumbersReadTextByNPOI(docPath, list);
                    }
                    else
                    {
                        // Best effort, as before: keep whatever was read, but leave a trace
                        // instead of swallowing the failure silently.
                        Debug.WriteLine($"GetAllText: NPOI read failed: {ex}");
                    }
                }

                return new Dictionary<string, object>
                {
                    { "list", list },
                    { "text", allText }
                };
            }
            finally
            {
                // Always remove the temporary file, including when reading throws.
                File.Delete(docPath);
            }
        }

        /// <summary>
        /// Deletes <c>word/numbering.xml</c> from the package at <paramref name="filePath"/>
        /// and reads the text again with NPOI.
        /// </summary>
        /// <param name="filePath">Path of the .docx package; it is modified in place.</param>
        /// <param name="list">Result list; cleared before the second read.</param>
        private static void RemoveNumbersReadTextByNPOI(string filePath, List<DocumentText> list)
        {
            list.Clear();

            // ZipArchiveMode.Update needs a readable, writable and seekable stream;
            // File.OpenWrite returns a write-only stream and makes the constructor throw.
            using (var fs = new FileStream(filePath, FileMode.Open, FileAccess.ReadWrite))
            using (var archive = new ZipArchive(fs, ZipArchiveMode.Update))
            {
                archive.GetEntry("word/numbering.xml")?.Delete();
            }

            ReadTextByNPOI(filePath, list);
        }

        /// <summary>
        /// Walks the body elements of the .docx at <paramref name="filePath"/> and appends
        /// one <c>DocumentText</c> per paragraph, tagged with a running paragraph number
        /// that starts at 1.
        /// </summary>
        /// <param name="filePath">Path of the .docx file to read.</param>
        /// <param name="list">List that receives the paragraphs.</param>
        private static void ReadTextByNPOI(string filePath, List<DocumentText> list)
        {
            using (var stream = File.OpenRead(filePath))
            {
                XWPFDocument ndoc = new XWPFDocument(stream);
                int paragraphNumber = 1;

                foreach (var bodyElement in ndoc.BodyElements)
                {
                    if (bodyElement is XWPFParagraph p)
                    {
                        // Normal paragraph: blank ones are counted but not collected.
                        var text = p.ParagraphText.Replace("\u0002", "");
                        if (text.Trim().Length > 0)
                        {
                            list.Add(new DocumentText(text, paragraphNumber));
                        }
                        paragraphNumber++;
                    }
                    else if (bodyElement is XWPFTable table)
                    {
                        // Table: every cell paragraph is collected (even when empty).
                        // Original note: VSTO also has a paragraph break for each cell.
                        foreach (var row in table.Rows)
                        {
                            foreach (var cell in row.GetTableCells())
                            {
                                foreach (var pc in cell.Paragraphs)
                                {
                                    list.Add(new DocumentText(pc.ParagraphText.Replace("\u0002", ""), paragraphNumber));
                                    paragraphNumber++;
                                }
                            }
                            // One extra number per row, with no list entry.
                            // Presumably this accounts for Word's end-of-row marker - TODO confirm.
                            paragraphNumber++;
                        }
                    }
                    else if (bodyElement is XWPFSDT sdt)
                    {
                        // Table of contents: skipped for now, only the numbering advances.
                        string tocText = sdt.Content.Text;
                        // NOTE(review): "\r" is listed before "\r\n", so a CRLF is split twice and
                        // counted as two entries. Kept as-is because changing it would shift the
                        // paragraph numbers; confirm which count matches Word.
                        var tocEntries = tocText.Split(new string[] { "\n", "\r", "\r\n", "\f" }, StringSplitOptions.None);
                        paragraphNumber += tocEntries.Length;
                    }
                }
            }
        }

        /// <summary>
        /// Saves <paramref name="doc"/> and produces a standalone copy of it that can be read
        /// from disk. A file whose extension ends in "doc" is additionally converted to .docx.
        /// </summary>
        /// <param name="doc">The Word document to copy.</param>
        /// <returns>Path of the copy. The caller is responsible for deleting it.</returns>
        public static string GetReadDocumentFilePath(Microsoft.Office.Interop.Word.Document doc)
        {
            // Save first so the file on disk has the latest content.
            doc.Save();

            string docPath = doc.FullName;
            string ext = Path.GetExtension(docPath);
            string tmpFile = Path.GetTempFileName();
            string resultPath = tmpFile;
            bool converted = false;

            try
            {
                // Copy to the temporary file. FileShare.ReadWrite lets the source be opened
                // while another handle (Word) has it open. CopyTo copies the whole stream,
                // unlike a single Read call, which may return fewer bytes than requested.
                using (var source = new FileStream(docPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                using (var target = new FileStream(tmpFile, FileMode.Create, FileAccess.Write))
                {
                    source.CopyTo(target);
                }

                if (ext.EndsWith("doc", StringComparison.OrdinalIgnoreCase))
                {
                    // Legacy .doc must be converted to .docx.
                    // NOTE(review): the converted file is written next to the original document
                    // (docPath + ".docx"), not into the temp directory - confirm this is intended.
                    string newTmpFile = docPath + ".docx";
                    ConvertDocToDocx(tmpFile, newTmpFile);
                    resultPath = newTmpFile;
                    converted = true;
                }
            }
            catch
            {
                // Do not leave the raw copy behind when copying or converting fails.
                File.Delete(tmpFile);
                throw;
            }

            if (converted)
            {
                // The raw copy is no longer needed once the .docx exists.
                File.Delete(tmpFile);
            }

            Debug.WriteLine($"GetReadDocumentFilePath =>=>{resultPath}");
            Debug.WriteLine($"GetReadDocumentFilePath End ==> {DateTime.Now.ToLongTimeString()}");
            return resultPath;
        }

        /// <summary>
        /// Opens <paramref name="sourcePath"/> in a separate, invisible Word instance and saves
        /// it as .docx to <paramref name="targetPath"/>. The document and the Word instance are
        /// closed even when the conversion fails.
        /// </summary>
        private static void ConvertDocToDocx(string sourcePath, string targetPath)
        {
            object missing = System.Reflection.Missing.Value;
            object saveOption = WdSaveOptions.wdDoNotSaveChanges;

            Application app = new Application();
            try
            {
                var tmpDoc = app.Documents.OpenNoRepairDialog(
                    FileName: sourcePath,
                    AddToRecentFiles: false,
                    ReadOnly: true,
                    Visible: false
                );
                try
                {
                    tmpDoc.SaveAs2(
                        FileName: targetPath,
                        FileFormat: WdSaveFormat.wdFormatXMLDocument,
                        AddToRecentFiles: false
                    );
                }
                finally
                {
                    tmpDoc.Close(ref saveOption, ref missing, ref missing);
                }
            }
            finally
            {
                // Quit in finally so a failed conversion does not leave the instance running.
                app.Quit(ref saveOption, ref missing, ref missing);
            }
        }

        /// <summary>
        /// Reads the document text through the Word object model.
        /// </summary>
        /// <param name="doc">The Word document to read.</param>
        /// <returns>
        /// A map with two entries: "list" (non-blank paragraphs outside tables, numbered from 1
        /// by their position in <c>doc.Paragraphs</c>) and "text" (the full text of <c>doc.Range()</c>).
        /// </returns>
        public static Dictionary<string, object> GetAllTextByVSTO(Microsoft.Office.Interop.Word.Document doc)
        {
            // Full text of the current document.
            string allText = doc.Range().Text;
            List<DocumentText> list = new List<DocumentText>();
            Paragraphs paragraphs = doc.Paragraphs;

            try
            {
                int paragraphNumber = 0;
                foreach (Paragraph p in paragraphs)
                {
                    try
                    {
                        paragraphNumber++;
                        Range r = p.Range;
                        string text = r.Text;

                        // Skip blank paragraphs, anything ending in "\r\a" and anything inside a
                        // table. Ordinal comparison: these are control characters, not text.
                        if (text.Trim().Length == 0
                            || text.EndsWith("\r\a", StringComparison.Ordinal)
                            || r.Tables.Count > 0)
                        {
                            continue;
                        }

                        list.Add(new DocumentText(text.Replace("\u0002", ""), paragraphNumber));
                    }
                    finally
                    {
                        // Release on every path; the skip path above used to bypass this.
                        Marshal.ReleaseComObject(p);
                    }
                }
            }
            finally
            {
                Marshal.ReleaseComObject(paragraphs);
            }

            return new Dictionary<string, object>
            {
                { "list", list },
                { "text", allText }
            };
        }

        /// <summary>
        /// Collects the non-blank paragraphs of the active document whose 1-based index lies in
        /// [<paramref name="start"/>, <paramref name="end"/>]. Paragraphs that contain tables,
        /// endnotes or footnotes, or that belong to a list, are skipped.
        /// </summary>
        /// <param name="start">First paragraph index; values below 1 are treated as 1.</param>
        /// <param name="end">Last paragraph index; values above the paragraph count are clamped.</param>
        /// <returns>The matching paragraphs; empty when <paramref name="start"/> is past the end.</returns>
        public static List<DocumentText> GetTextListByParagraphRange(int start, int end)
        {
            List<DocumentText> list = new List<DocumentText>();
            var doc = Globals.ThisAddIn.Application.ActiveDocument;
            Paragraphs paragraphs = doc.Paragraphs;
            int total = paragraphs.Count;

            if (start > total)
            {
                return list;
            }

            start = Math.Max(start, 1);
            end = Math.Min(end, total);

            for (int paragraphNumber = start; paragraphNumber <= end; paragraphNumber++)
            {
                Paragraph p = paragraphs[paragraphNumber];
                Range r = p.Range;

                if (r.Tables.Count > 0
                    || r.Endnotes.Count > 0
                    || r.Footnotes.Count > 0
                    || r.ListFormat.ListType != WdListType.wdListNoNumbering)
                {
                    continue;
                }

                string text = r.Text;
                if (text.Trim().Length > 0)
                {
                    list.Add(new DocumentText(text, paragraphNumber));
                }
            }

            return list;
        }

        /// <summary>
        /// Serializes <paramref name="data"/> to indented JSON.
        /// </summary>
        public static string GetJSONString(object data)
        {
            return JsonConvert.SerializeObject(data, Formatting.Indented);
        }

        /// <summary>
        /// Returns the device identifier stored in <c>deviceId.txt</c> under
        /// <c>Config.APP_DATA_PATH</c>, generating and storing a new GUID when there is none.
        /// </summary>
        /// <returns>The device identifier.</returns>
        public static string GetDeviceId()
        {
            string devicePath = Config.APP_DATA_PATH + "\\deviceId.txt";

            // Reuse the stored id, unless the file is empty or whitespace-only.
            if (File.Exists(devicePath))
            {
                string stored = File.ReadAllText(devicePath).Trim();
                if (stored.Length > 0)
                {
                    return stored;
                }
            }

            string deviceId = Guid.NewGuid().ToString().ToLower();

            // Make sure the target directory exists before writing the id as plain text.
            string directory = Path.GetDirectoryName(devicePath);
            if (!string.IsNullOrEmpty(directory))
            {
                Directory.CreateDirectory(directory);
            }

            File.WriteAllText(devicePath, deviceId);
            return deviceId;
        }
    }
}