285 lines
11 KiB
C#

using AIProofread.core;
using Microsoft.Office.Interop.Word;
using Newtonsoft.Json;
using NPOI;
using NPOI.POIFS.FileSystem;
using NPOI.XWPF.UserModel;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using System.Security.Cryptography;
namespace AIProofread
{
public class Tools
{
// Paragraph-break separators. The two-character sequences ("\r\n", "\r\f") are listed
// before their single-character prefixes so that string.Split matches them as one break.
// NOTE(review): not referenced by any method visible in this file — confirm it is still needed.
private static readonly string[] paragSplitor = new string[] { "\r\n", "\r\f", "\r", "\n", "\f" };
/// <summary>
/// Reads the document's text with NPOI, working on a temporary copy of the file
/// produced by <see cref="GetReadDocumentFilePath"/>.
/// </summary>
/// <param name="doc">The Word document to read.</param>
/// <returns>
/// A dictionary with two entries: "list" (the DocumentText items collected by NPOI)
/// and "text" (the value of <c>doc.Range().Text</c>).
/// </returns>
public static Dictionary<string, object> GetAllText(Microsoft.Office.Interop.Word.Document doc)
{
    // Read from a temporary copy so the file Word has open is never touched directly.
    // NOTE: a VSTO-based path (GetAllTextByVSTO / DocumentReader.ReadByVSTO) for .wps and
    // small documents, and as a fallback on NPOI errors, was tried here and is disabled.
    string docPath = Tools.GetReadDocumentFilePath(doc);
    try
    {
        // Debug.WriteLine(string, string) is (message, category), so build one message.
        Debug.WriteLine($"GetAllText Start ==> {DateTime.Now.ToLongTimeString()}");

        // Full text of the document as Word reports it.
        string allText = doc.Range().Text;

        List<DocumentText> list = new List<DocumentText>();
        using (FileStream stream = new FileStream(docPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            ReadTextByNPOI(stream, list);
        }

        return new Dictionary<string, object>
        {
            { "list", list },
            { "text", allText }
        };
    }
    finally
    {
        // Always remove the temporary file, including when reading fails.
        File.Delete(docPath);
    }
}
/// <summary>
/// Walks the body elements of a .docx stream with NPOI and appends the paragraph text
/// it finds to <paramref name="list"/>, tagging each entry with a running paragraph number.
/// </summary>
/// <param name="stream">Readable stream positioned at the start of a .docx file.</param>
/// <param name="list">Destination list; entries are appended in document order.</param>
private static void ReadTextByNPOI(FileStream stream, List<DocumentText> list)
{
    // "\r\n" must precede "\r" and "\n": string.Split tries separators in array order at
    // each position, so listing it last made it unreachable and counted CRLF as two breaks.
    string[] lineBreaks = { "\r\n", "\n", "\r", "\f" };

    XWPFDocument ndoc = new XWPFDocument(stream);

    // Running paragraph counter, starting at 1.
    // NOTE(review): presumably meant to line up with Word's own paragraph indices — confirm.
    int paragraphNumber = 1;

    foreach (var bodyElement in ndoc.BodyElements)
    {
        if (bodyElement is XWPFParagraph p)
        {
            // Ordinary paragraph: strip U+0002 characters and keep it only if non-blank.
            var text = p.ParagraphText.Replace("\u0002", "");
            if (text.Trim().Length > 0)
            {
                list.Add(new DocumentText(text, paragraphNumber));
            }
            paragraphNumber++;
        }
        else if (bodyElement is XWPFTable table)
        {
            // Table: every paragraph of every cell is added (blank ones included).
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.GetTableCells())
                {
                    foreach (var pc in cell.Paragraphs)
                    {
                        list.Add(new DocumentText(pc.ParagraphText.Replace("\u0002", ""), paragraphNumber));
                        paragraphNumber++;
                    }
                }
                // One extra number per row.
                // NOTE(review): presumably accounts for Word's end-of-row mark — confirm.
                paragraphNumber++;
            }
        }
        else if (bodyElement is XWPFSDT sdt)
        {
            // Structured document tag (e.g. a table of contents): its entries are skipped
            // for now, but the counter still advances by the number of lines it contains.
            string tocText = sdt.Content.Text;
            var tocEntries = tocText.Split(lineBreaks, StringSplitOptions.None);
            paragraphNumber += tocEntries.Length;
        }
    }
}
/// <summary>
/// Copies the document's file to a temporary location and returns the path of the copy.
/// When the extension ends with "doc", the copy is additionally converted to .docx through
/// a separate Word instance and the path of the converted file is returned.
/// </summary>
/// <param name="doc">The Word document whose file (<c>doc.FullName</c>) is copied.</param>
/// <returns>Path of a temporary file; the caller is responsible for deleting it.</returns>
public static string GetReadDocumentFilePath(Microsoft.Office.Interop.Word.Document doc)
{
    string docPath = doc.FullName;
    string ext = Path.GetExtension(docPath);
    string tmpFile = Path.GetTempFileName();

    // Debug.WriteLine(string, string) is (message, category) and never formats "{0}",
    // so each message is built with interpolation instead.
    Debug.WriteLine($"GetReadDocumentFilePath Start ==> {DateTime.Now.ToLongTimeString()}");
    Debug.WriteLine($"GetReadDocumentFilePath =>{docPath}");

    try
    {
        // FileShare.ReadWrite lets the copy proceed while another handle has the file open.
        // CopyTo loops until end of stream; a single Read call may return fewer bytes.
        using (FileStream source = new FileStream(docPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        using (FileStream target = new FileStream(tmpFile, FileMode.Create, FileAccess.Write))
        {
            source.CopyTo(target);
        }
    }
    catch
    {
        // Do not leave the placeholder created by GetTempFileName behind.
        File.Delete(tmpFile);
        throw;
    }

    // Legacy .doc files are converted to .docx.
    if (ext.EndsWith("doc", StringComparison.OrdinalIgnoreCase))
    {
        tmpFile = ConvertDocToDocx(tmpFile);
    }

    Debug.WriteLine($"GetReadDocumentFilePath =>=>{tmpFile}");
    Debug.WriteLine($"GetReadDocumentFilePath End ==> {DateTime.Now.ToLongTimeString()}");
    return tmpFile;
}

/// <summary>
/// Opens <paramref name="sourcePath"/> in a new, hidden Word instance, saves it as .docx
/// beside it, and deletes <paramref name="sourcePath"/>.
/// </summary>
/// <param name="sourcePath">Path of the temporary copy of the .doc file.</param>
/// <returns>Path of the converted .docx file.</returns>
private static string ConvertDocToDocx(string sourcePath)
{
    // Keep the converted file next to the temporary copy rather than next to the user's
    // document, so an existing "<name>.doc.docx" there is never overwritten and later deleted.
    string docxPath = sourcePath + ".docx";
    object missing = System.Reflection.Missing.Value;
    object saveOption = WdSaveOptions.wdDoNotSaveChanges;

    try
    {
        Application app = new Application();
        try
        {
            var tmpDoc = app.Documents.OpenNoRepairDialog(
                FileName: sourcePath,
                AddToRecentFiles: false,
                ReadOnly: true,
                Visible: false
            );
            try
            {
                tmpDoc.SaveAs2(
                    FileName: docxPath,
                    FileFormat: WdSaveFormat.wdFormatXMLDocument,
                    AddToRecentFiles: false
                );
            }
            finally
            {
                tmpDoc.Close(ref saveOption, ref missing, ref missing);
            }
        }
        finally
        {
            // Quit the helper Word instance even when opening or saving failed.
            app.Quit(ref saveOption, ref missing, ref missing);
        }
    }
    finally
    {
        // The intermediate copy is no longer needed, whether or not conversion succeeded.
        File.Delete(sourcePath);
    }

    return docxPath;
}
/// <summary>
/// Collects the document's paragraph text through the Word object model.
/// </summary>
/// <param name="doc">The Word document to read.</param>
/// <returns>
/// A dictionary with two entries: "list" (one DocumentText per kept paragraph, tagged with
/// its 1-based position among all paragraphs) and "text" (the value of <c>doc.Range().Text</c>).
/// </returns>
public static Dictionary<string, object> GetAllTextByVSTO(Microsoft.Office.Interop.Word.Document doc)
{
    // Full text of the document as Word reports it.
    string allText = doc.Range().Text;
    List<DocumentText> list = new List<DocumentText>();
    Paragraphs paragraphs = doc.Paragraphs;

    int paragraphNumber = 0;
    foreach (Paragraph p in paragraphs)
    {
        // Skipped paragraphs still consume a number.
        paragraphNumber++;

        Range r = p.Range;
        try
        {
            string text = r.Text;

            // Skip blank paragraphs, text ending in "\r\a", and anything inside a table.
            // NOTE(review): "\r\a" is presumably Word's end-of-cell marker — confirm.
            // Ordinal comparison: these are control characters, not linguistic text.
            if (text.Trim().Length == 0
                || text.EndsWith("\r\a", StringComparison.Ordinal)
                || r.Tables.Count > 0)
            {
                continue;
            }

            list.Add(new DocumentText(text.Replace("\u0002", ""), paragraphNumber));
        }
        finally
        {
            // Release the COM wrappers on every path, including the skip path above.
            Marshal.ReleaseComObject(r);
            Marshal.ReleaseComObject(p);
        }
    }
    Marshal.ReleaseComObject(paragraphs);

    var map = new Dictionary<string, object>
    {
        { "list", list },
        { "text", allText }
    };
    return map;
}
/// <summary>
/// Returns the text of the active document's paragraphs whose 1-based index lies in
/// [<paramref name="start"/>, <paramref name="end"/>], clamped to the document's bounds.
/// </summary>
/// <param name="start">First paragraph index; values below 1 are treated as 1.</param>
/// <param name="end">Last paragraph index; values above the paragraph count are clamped.</param>
/// <returns>
/// One DocumentText per kept paragraph. Blank paragraphs and paragraphs whose range contains
/// a table, an endnote, a footnote, or list numbering are left out. The list is empty when
/// <paramref name="start"/> is beyond the last paragraph.
/// </returns>
public static List<DocumentText> GetTextListByParagraphRange(int start, int end)
{
    List<DocumentText> list = new List<DocumentText>();
    var doc = Globals.ThisAddIn.Application.ActiveDocument;
    Paragraphs paragraphs = doc.Paragraphs;
    int total = paragraphs.Count;

    if (start > total)
    {
        return list;
    }

    start = Math.Max(start, 1);
    end = Math.Min(end, total);

    for (int paragraphNumber = start; paragraphNumber <= end; paragraphNumber++)
    {
        Paragraph p = paragraphs[paragraphNumber];
        Range r = p.Range;

        bool skip = r.Tables.Count > 0
            || r.Endnotes.Count > 0
            || r.Footnotes.Count > 0
            || r.ListFormat.ListType != WdListType.wdListNoNumbering;
        if (skip)
        {
            continue;
        }

        // Reuse the Range already fetched instead of asking Word for a second one.
        string text = r.Text;
        if (text.Trim().Length > 0)
        {
            list.Add(new DocumentText(text, paragraphNumber));
        }
    }
    return list;
}
/// <summary>
/// Serializes <paramref name="data"/> to an indented JSON string using Json.NET.
/// </summary>
/// <param name="data">The object to serialize.</param>
/// <returns>The indented JSON text.</returns>
public static string GetJSONString(object data)
    => JsonConvert.SerializeObject(data, Formatting.Indented);
/// <summary>
/// Returns this installation's device identifier, creating and persisting a new one
/// in "deviceId.txt" under <c>Config.APP_DATA_PATH</c> when none is stored yet.
/// </summary>
/// <returns>The stored identifier, or a newly generated lower-case GUID string.</returns>
public static string GetDeviceId()
{
    string devicePath = Path.Combine(Config.APP_DATA_PATH, "deviceId.txt");

    // Reuse the stored id when there is one. A blank file is treated as missing so an
    // empty identifier is never returned; surrounding whitespace/newlines are dropped.
    if (File.Exists(devicePath))
    {
        string stored = File.ReadAllText(devicePath).Trim();
        if (stored.Length > 0)
        {
            return stored;
        }
    }

    string deviceId = Guid.NewGuid().ToString().ToLowerInvariant();

    // Make sure the target directory exists before writing; WriteAllText does not create it.
    string directory = Path.GetDirectoryName(devicePath);
    if (!string.IsNullOrEmpty(directory))
    {
        System.IO.Directory.CreateDirectory(directory);
    }

    // Persist the id as plain text.
    File.WriteAllText(devicePath, deviceId);
    return deviceId;
}
}
}