后台处理文档

This commit is contained in:
LittleBoy 2024-08-17 10:09:31 +08:00
parent 097ec94001
commit 9dbe1d5bf4
3 changed files with 224 additions and 52 deletions

View File

@ -1,4 +1,5 @@
using AIProofread.Controls;
using AIProofread.Util;
using Microsoft.Office.Interop.Word;
using Microsoft.Office.Tools.Word;
using Microsoft.Web.WebView2.Core;
@ -11,8 +12,10 @@ using System.Drawing;
using System.IO;
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using UtilLib;
using Document = Microsoft.Office.Interop.Word.Document;
using Task = System.Threading.Tasks.Task;
namespace AIProofread
{
@ -149,21 +152,114 @@ namespace AIProofread
/// <summary>
/// Gets the text data of the active Word document.
/// </summary>
/// <returns>
/// The dictionary produced by <c>Tools.GetAllText</c> for
/// <c>Globals.ThisAddIn.Application.ActiveDocument</c>.
/// </returns>
public Dictionary<string, object> getAllText()
{
    // The document is passed explicitly; Tools.GetAllText takes the document to read as its argument.
    return Tools.GetAllText(Globals.ThisAddIn.Application.ActiveDocument);
}
/// <summary>
/// Builds a JSON description of the active document: its name, full name, word count,
/// character count and extracted text content.
/// </summary>
/// <remarks>
/// The content is not read from the active document itself. The document's file is copied
/// to a temporary file, that copy is opened in a separate Word <c>Application</c> instance,
/// and <c>Tools.GetAllText</c> runs against the copy. Presumably this keeps the extraction
/// away from the document the user is editing — TODO confirm.
/// </remarks>
/// <returns>JSON string with the keys "name", "fullName", "wordsCount", "charactersCount" and "content".</returns>
public string getDocumentData()
{
    var doc = Globals.ThisAddIn.Application.ActiveDocument;
    Dictionary<string, object> data = new Dictionary<string, object>();
    // Each key is added exactly once; Dictionary.Add throws on a duplicate key.
    data.Add("name", doc.Name);
    data.Add("fullName", doc.FullName);
    data.Add("wordsCount", doc.Words.Count);
    data.Add("charactersCount", doc.Characters.Count);

    object missing = System.Reflection.Missing.Value;
    object saveOption = WdSaveOptions.wdDoNotSaveChanges;
    Application app = null;
    Document tmpDoc = null;
    string path = null;
    try
    {
        app = new Application();
        // Create the temporary copy of the document file.
        path = CreateTempDocumentFile(doc);
        tmpDoc = app.Documents.Open(path, false, true, false);
        data.Add("content", Tools.GetAllText(tmpDoc));
    }
    finally
    {
        // Cleanup is nested so that a failure in one step cannot skip the following ones;
        // otherwise a hidden Word process or the temporary file would be left behind.
        try
        {
            if (tmpDoc != null)
            {
                tmpDoc.Close(ref saveOption, ref missing, ref missing);
                Marshal.ReleaseComObject(tmpDoc);
            }
        }
        finally
        {
            try
            {
                if (app != null)
                {
                    app.Quit(ref saveOption, ref missing, ref missing);
                    Marshal.ReleaseComObject(app);
                }
            }
            finally
            {
                // Explicit collection, kept from the original implementation.
                GC.Collect();
                if (path != null)
                {
                    File.Delete(path);
                }
            }
        }
    }

    return Tools.GetJSONString(data);
}
/// <summary>
/// Copies the file at <c>doc.FullName</c> into a newly created temporary file.
/// </summary>
/// <param name="doc">Document whose <c>FullName</c> is opened as the source file.</param>
/// <returns>Path of the temporary copy. This method does not delete it.</returns>
private string CreateTempDocumentFile(Document doc)
{
    // FileShare.ReadWrite allows opening the source while another handle has it open for
    // reading or writing (presumably Word itself — TODO confirm).
    using (FileStream source = new FileStream(doc.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    {
        string tmpFile = Path.GetTempFileName();
        try
        {
            using (FileStream target = new FileStream(tmpFile, FileMode.OpenOrCreate, FileAccess.ReadWrite))
            {
                // CopyTo loops until end of stream; a single Read call is not guaranteed
                // to return all requested bytes.
                source.CopyTo(target);
            }
        }
        catch
        {
            // Do not leave an orphaned temp file behind when the copy fails.
            File.Delete(tmpFile);
            throw;
        }

        return tmpFile;
    }
}
/// <summary>
/// Returns, as JSON, the paragraph texts that <c>Tools.GetTextListByParagraphRange</c>
/// yields for the given paragraph range.
/// </summary>
/// <param name="start">First paragraph number, passed through unchanged.</param>
/// <param name="end">Last paragraph number, passed through unchanged.</param>
/// <returns>The list serialized by <c>Tools.GetJSONString</c>.</returns>
public string getParagraphTextByRange(int start, int end)
{
    return Tools.GetJSONString(Tools.GetTextListByParagraphRange(start, end));
}
/// <summary>
/// Splits the active document's text in three different ways and returns all three
/// results as one JSON object.
/// </summary>
/// <returns>
/// JSON with "origin_cut" (raw content text split on "\r\a", "\a", "\r", "\v", "\f"),
/// "trim_cut" (text normalized by <c>HostHelper.ReplaceSpecialChars</c>, split on '\n')
/// and "paragraph_cut" (the text of every entry in <c>doc.Paragraphs</c>).
/// </returns>
public string getAllParagraphs()
{
    var doc = Globals.ThisAddIn.Application.ActiveDocument;
    string rawText = doc.Content.Text;
    string normalizedText = HostHelper.ReplaceSpecialChars(rawText, isReplaceMultSpaceLine: true);

    string[] originCut = rawText.Split(new string[] { "\r\a", "\a", "\r", "\v", "\f" }, StringSplitOptions.None);
    string[] trimCut = normalizedText.Split('\n');

    // The Paragraphs collection is indexed from 1, hence the +1 below.
    var paragraphs = doc.Paragraphs;
    int total = paragraphs.Count;
    string[] paragraphCut = new string[total];
    for (int index = 0; index < total; index++)
    {
        paragraphCut[index] = GetParagraphText(paragraphs[index + 1]);
    }

    var data = new Dictionary<string, object>();
    data.Add("origin_cut", originCut);
    data.Add("trim_cut", trimCut);
    data.Add("paragraph_cut", paragraphCut);
    return Tools.GetJSONString(data);
}
/// <summary>
/// Returns the text of a paragraph by delegating its <c>Range</c> to <c>GetRangeText</c>.
/// </summary>
/// <param name="paragraph">Paragraph whose range text is read.</param>
/// <returns>The value returned by <c>GetRangeText</c> for the paragraph's range.</returns>
private string GetParagraphText(Paragraph paragraph) => GetRangeText(paragraph.Range);
/// <summary>
/// Returns the <c>Text</c> of the given range without any further processing.
/// </summary>
/// <param name="range">Range to read.</param>
/// <returns><c>range.Text</c>.</returns>
private string GetRangeText(Range range) => range.Text;
/// <summary>
/// Starts a task that collects the paragraph texts for the given range and sends them,
/// serialized as JSON, through <c>Globals.ThisAddIn.SendMessageToWeb</c> under the message
/// name "getParagraphTextByRange".
/// </summary>
/// <remarks>
/// Despite the "Sync" suffix, this method returns as soon as the task is queued; the
/// result is delivered only through the message.
/// </remarks>
/// <param name="start">First paragraph number, passed to <c>Tools.GetTextListByParagraphRange</c>.</param>
/// <param name="end">Last paragraph number, passed to <c>Tools.GetTextListByParagraphRange</c>.</param>
public void getParagraphTextByRangeSync(int start, int end)
{
    Task.Run(() =>
    {
        try
        {
            var list = Tools.GetTextListByParagraphRange(start, end);
            Globals.ThisAddIn.SendMessageToWeb("getParagraphTextByRange", Tools.GetJSONString(list));
        }
        catch (Exception ex)
        {
            // The task is never awaited, so without this handler a failure would go unobserved.
            System.Diagnostics.Trace.TraceError("getParagraphTextByRangeSync failed: " + ex);
        }
    });
}
/// <summary>
/// Returns the number of paragraphs in the active document.
/// </summary>
/// <returns><c>Paragraphs.Count</c> of <c>Globals.ThisAddIn.Application.ActiveDocument</c>.</returns>
public int getTotalParagraphNumber()
{
    var activeDocument = Globals.ThisAddIn.Application.ActiveDocument;
    var paragraphs = activeDocument.Paragraphs;
    return paragraphs.Count;
}
/// <summary>
/// 读取文档原始文件并转换成base64
/// </summary>
@ -482,18 +578,28 @@ namespace AIProofread
int index = 0;
foreach (var item in correct.Diffs)
{
if (item.idx == 3330)
{
Console.WriteLine("xx");
}
var mark = AddBookmark(item, index, correct.Sentence_offset, correct.Insert_len, correct.Paragraph_num);
if (item.tag != "i") index++;
var msg = new Dictionary<string, object>{
{"message",mark == null ? "没有找到标记对象":"标记对象" + mark.Name },
{ "origin",item }
};
if (mark != null)
{
marks.Add(item.id, new ProofreadItem(item, mark));
}
else
{
var msg = new Dictionary<string, object>{
{"message","没有找到标记对象" },
{ "origin",item },
{ "origin_correct",correct },
{ "new_text",correct.New_text },
{ "paragraph_num",correct.Paragraph_num },
};
Logger.Log(JsonConvert.SerializeObject(msg));
}
Logger.LogToWeb(JsonConvert.SerializeObject(msg));
}
}
}

View File

@ -0,0 +1,52 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace AIProofread.Util
{
/// <summary>
/// String helpers for normalizing text that contains control characters.
/// </summary>
public class HostHelper
{
    // Matches a single carriage return, form feed, BEL or vertical tab.
    // Built once instead of on every ReplaceSpecialChars call.
    private static readonly Regex LineBreakLikeChars = new Regex("[\r\f\a\v]");

    // Characters trimmed by RemoveInvisibleCharas, without and with the tab character.
    private static readonly char[] InvisibleChars = { '\r', '\a', '\n', '\f', '\v' };
    private static readonly char[] InvisibleCharsAndTab = { '\r', '\a', '\n', '\f', '\v', '\t' };

    // Characters that do not count as content for HasValidCharacters:
    // U+0000 through U+001F, the space character and '/'.
    private static readonly char[] NonContentChars =
    {
        '\0', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\a', '\b', '\t',
        '\n', '\v', '\f', '\r', '\u000e', '\u000f', '\u0010', '\u0011', '\u0012', '\u0013',
        '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b', '\u001c', '\u001d',
        '\u001e', '\u001f', ' ', '/'
    };

    /// <summary>
    /// Replaces every '\r', '\f', '\a' and '\v' with '\n' and every U+001E with '-'.
    /// </summary>
    /// <param name="text">Text to normalize; null or empty input is returned unchanged.</param>
    /// <param name="isReplaceMultSpaceLine">
    /// When true, empty lines are removed after the replacement (runs of '\n' collapse to
    /// one); a single trailing '\n' is kept if the replaced text ended with one.
    /// </param>
    /// <returns>The normalized text.</returns>
    public static string ReplaceSpecialChars(string text, bool isReplaceMultSpaceLine = false)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        text = LineBreakLikeChars.Replace(text, "\n").Replace("\u001e", "-");
        if (!isReplaceMultSpaceLine)
        {
            return text;
        }

        // Both replacements are one-for-one, so the text is still non-empty here.
        bool endsWithNewLine = text[text.Length - 1] == '\n';
        text = string.Join("\n", text.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries));
        return endsWithNewLine ? text + "\n" : text;
    }

    /// <summary>
    /// Trims '\r', '\a', '\n', '\f' and '\v' from both ends of the text. Characters in the
    /// middle of the text are not touched.
    /// </summary>
    /// <param name="text">Text to trim.</param>
    /// <param name="containTable">When true, '\t' is trimmed as well.</param>
    /// <returns>The trimmed text, or an empty string for null or empty input.</returns>
    public static string RemoveInvisibleCharas(string text, bool containTable = false)
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        return text.Trim(containTable ? InvisibleCharsAndTab : InvisibleChars);
    }

    /// <summary>
    /// Reports whether anything is left after trimming control characters (U+0000 through
    /// U+001F), spaces and '/' from both ends of the text.
    /// </summary>
    /// <param name="text">Text to inspect.</param>
    /// <returns>True if something remains after trimming; false otherwise, including for null or empty input.</returns>
    public static bool HasValidCharacters(string text)
    {
        // Null input is treated as "no content" rather than failing on the Trim call.
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        return text.Trim(NonContentChars).Length > 0;
    }
}
}

View File

@ -1,72 +1,56 @@
using AIProofread.Controls;
using Microsoft.Office.Interop.Word;
using Microsoft.Office.Interop.Word;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Diagnostics;
using System.Security.Cryptography;
using System.Security.Policy;
using System.Text;
using System.Windows.Forms;
using static System.Net.Mime.MediaTypeNames;
namespace AIProofread
{
public class Tools
{
// Paragraph delimiters: "\r", "\n" and "\r\n".
// NOTE(review): within the code visible here this is referenced only from commented-out
// splitting logic in GetAllText — confirm whether it is still needed.
private static readonly string[] paragSplitor = new string[] { "\r", "\n", "\r\n" };
public static Dictionary<string, object> GetAllText()
public static Dictionary<string, object> GetAllText(Document doc)
{
var doc = Globals.ThisAddIn.Application.ActiveDocument;
// 获取当前文档所有文本
string allText = doc.Range().Text;
List<DocumentText> list = new List<DocumentText>();
// // 开始分割
MD5 md5 = new MD5CryptoServiceProvider();
//if (allText != null && allText.Trim().Length > 0)
//{
// List<string> lines = allText.Split(paragSplitor, StringSplitOptions.None).ToList();//StringUtil.CutTextToSentences(allText);
// foreach (string text in lines)
// {
// if(text.Trim().Length > 0)
// {
// byte[] hash = md5.ComputeHash(Encoding.Default.GetBytes(text));
// list.Add(new DocumentText(hash, text + "\n"));
// }
// else
// {
// list.Add(new DocumentText(text + "\n"));
// }
// }
//}
Paragraphs paragraphs = doc.Paragraphs;
int total = paragraphs.Count;
//FormLoading frm = new FormLoading();
//frm.Show();
for (int paragraphNumber = 1; paragraphNumber <= total; paragraphNumber++)
int paragraphNumber = 0;
foreach (Paragraph p in paragraphs)
{
Paragraph p = paragraphs[paragraphNumber];
paragraphNumber++;
if (paragraphNumber % 20 == 0)
{
Debug.WriteLine("process paragraphNumber{0}", paragraphNumber);
}
Range r = p.Range;
if (r.ListFormat.ListType == WdListType.wdListPictureBullet
|| r.Tables.Count > 0
|| p.Range.Text.Trim().Length == 0)
string text = p.Range.Text;
if (text.Trim().Length == 0 || text.EndsWith("\r\a") || r.Tables.Count > 0)
{
continue;
}
string text = p.Range.Text;
//Logger.LogToWeb(string.Format("get paragraph {0}", paragraphNumber));
//frm.SetLoadingText(text);
if (text.Trim().Length > 0)
{
//byte[] hash = md5.ComputeHash(Encoding.Default.GetBytes(text));
list.Add(new DocumentText(text,paragraphNumber));
}
list.Add(new DocumentText(text.Replace("\u0002", ""), paragraphNumber));
}
//for (; paragraphNumber <= total; paragraphNumber++)
//{
// Paragraph p = paragraphs[paragraphNumber];
// //Logger.LogToWeb(string.Format("get paragraph {0}", paragraphNumber));
// //frm.SetLoadingText(text);
// //if (text.Trim().Length > 0)
// //{
// // //byte[] hash = md5.ComputeHash(Encoding.Default.GetBytes(text));
// //}
//}
//frm.Close();
var map = new Dictionary<string, object>
{
@ -76,6 +60,36 @@ namespace AIProofread
return map;
}
/// <summary>
/// Collects the text of the active document's paragraphs whose numbers lie in
/// [<paramref name="start"/>, <paramref name="end"/>].
/// </summary>
/// <remarks>
/// Paragraphs whose range contains a table, an endnote or a footnote, or whose list type
/// is not <c>wdListNoNumbering</c>, are skipped, as are paragraphs whose text is empty
/// after trimming.
/// </remarks>
/// <param name="start">First paragraph number; values below 1 are raised to 1.</param>
/// <param name="end">Last paragraph number; values above the paragraph count are lowered to it.</param>
/// <returns>
/// One <c>DocumentText</c> per kept paragraph, constructed from its text and paragraph
/// number; an empty list when <paramref name="start"/> exceeds the paragraph count.
/// </returns>
public static List<DocumentText> GetTextListByParagraphRange(int start, int end)
{
    var result = new List<DocumentText>();
    Paragraphs paragraphs = Globals.ThisAddIn.Application.ActiveDocument.Paragraphs;
    int total = paragraphs.Count;
    if (start > total)
    {
        return result;
    }

    int first = start < 1 ? 1 : start;
    int last = end > total ? total : end;
    for (int number = first; number <= last; number++)
    {
        Paragraph paragraph = paragraphs[number];
        Range range = paragraph.Range;

        // Keep only paragraphs with no table, endnote or footnote and no list formatting.
        bool isPlainParagraph = range.Tables.Count <= 0
            && range.Endnotes.Count <= 0
            && range.Footnotes.Count <= 0
            && range.ListFormat.ListType == WdListType.wdListNoNumbering;
        if (!isPlainParagraph)
        {
            continue;
        }

        string text = paragraph.Range.Text;
        if (text.Trim().Length > 0)
        {
            result.Add(new DocumentText(text, number));
        }
    }

    return result;
}
public static string GetJSONString(object data)
{
return JsonConvert.SerializeObject(data, Formatting.Indented);