2024-03-28 19:22:43 +08:00

77 lines
2.6 KiB
C#

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace AIProofread
{
/// <summary>
/// Text helpers that split a block of text into sentences while keeping the
/// line breaks attached to the sentence that ends each line.
/// </summary>
public class StringUtil
{
    /// <summary>
    /// Control character inserted at every detected sentence boundary and then
    /// used as the split delimiter.
    /// </summary>
    private const char SentenceBreak = '\x01';

    /// <summary>
    /// Replacement that keeps both captured groups and puts the boundary marker
    /// between them.
    /// </summary>
    private const string BoundaryReplacement = "$1\x01$2";

    /// <summary>
    /// Line separators recognised when splitting text into lines. "\r\n" must be
    /// listed first: String.Split tries the separators in array order, so a
    /// leading "\r" entry would consume half of every CRLF and leave a phantom
    /// empty line behind.
    /// </summary>
    private static readonly string[] ParagraphSeparators = { "\r\n", "\r", "\n" };

    // NOTE(review): the terminator classes below list '。' plus ASCII '!' and '?'
    // only; full-width '!' / '?' are not treated as sentence terminators.
    // Confirm that this is intended.

    /// <summary>A terminator followed by anything except a closing quote.</summary>
    private static readonly Regex TerminatorBoundary = new Regex(@"([。!?\?])([^”’])");

    /// <summary>Six ASCII dots followed by anything except a closing quote.</summary>
    private static readonly Regex DotEllipsisBoundary = new Regex(@"(\.{6})([^”’])");

    /// <summary>Two ellipsis characters followed by anything except a closing quote.</summary>
    private static readonly Regex EllipsisBoundary = new Regex(@"(\…{2})([^”’])");

    /// <summary>
    /// A terminator plus closing quote, followed by anything that is not a comma
    /// or another terminator.
    /// </summary>
    private static readonly Regex QuotedTerminatorBoundary = new Regex(@"([。!?\?][”’])([^,。!?\?])");

    /// <summary>
    /// Splits a single paragraph into sentences.
    /// </summary>
    /// <param name="para">Paragraph text, including any trailing newlines.</param>
    /// <returns>
    /// The pieces between sentence boundaries. Trailing newlines that follow a
    /// terminator come back as a separate, newline-only piece.
    /// </returns>
    private static string[] CutParagraphSentences(string para)
    {
        para = TerminatorBoundary.Replace(para, BoundaryReplacement);
        para = DotEllipsisBoundary.Replace(para, BoundaryReplacement);
        para = EllipsisBoundary.Replace(para, BoundaryReplacement);
        para = QuotedTerminatorBoundary.Replace(para, BoundaryReplacement);
        return para.Split(SentenceBreak);
    }

    /// <summary>
    /// Turns the raw line split into paragraphs that each end with "\n".
    /// </summary>
    /// <param name="paragraphs">Lines produced by splitting on line separators.</param>
    /// <returns>
    /// One entry per non-empty line, with "\n" appended. Every empty line adds one
    /// more "\n" to the preceding entry; empty lines before the first non-empty
    /// line are dropped.
    /// </returns>
    private static List<string> PreProcessList(string[] paragraphs)
    {
        var list = new List<string>();
        foreach (var text in paragraphs)
        {
            if (text.Length == 0)
            {
                if (list.Count > 0)
                {
                    list[list.Count - 1] += "\n";
                }
                continue;
            }
            list.Add(text + "\n");
        }
        return list;
    }

    /// <summary>
    /// Re-attaches newline-only pieces to the sentence that precedes them.
    /// </summary>
    /// <param name="paragraphs">Pieces returned by the sentence splitter.</param>
    /// <returns>The pieces with newline-only entries merged into their predecessor.</returns>
    private static List<string> AfterProcessList(string[] paragraphs)
    {
        var list = new List<string>();
        foreach (var text in paragraphs)
        {
            // Append the whole newline run, not a single "\n", so blank lines
            // after a terminator are preserved. With nothing to attach to, the
            // piece is kept as its own entry instead of indexing an empty list.
            if (list.Count > 0 && IsOnlyNewlines(text))
            {
                list[list.Count - 1] += text;
                continue;
            }
            list.Add(text);
        }
        return list;
    }

    /// <summary>
    /// Tells whether a string is non-empty and made up solely of '\n' characters.
    /// </summary>
    private static bool IsOnlyNewlines(string text)
    {
        foreach (var ch in text)
        {
            if (ch != '\n')
            {
                return false;
            }
        }
        return text.Length > 0;
    }

    /// <summary>
    /// Splits text into sentences.
    /// </summary>
    /// <param name="text">Text to split; may be null or empty.</param>
    /// <returns>
    /// The sentences in order. The last sentence of every non-empty line ends with
    /// "\n", plus one extra "\n" per blank line that follows it. Returns an empty
    /// list for null or empty input.
    /// </returns>
    public static List<string> CutTextToSentences(string text)
    {
        var result = new List<string>();
        if (string.IsNullOrEmpty(text))
        {
            return result;
        }

        // Split into paragraphs first so that the line break can be attached to
        // the last sentence of each paragraph.
        var paragraphs = PreProcessList(text.Split(ParagraphSeparators, StringSplitOptions.None));
        foreach (var paragraph in paragraphs)
        {
            // Split the paragraph into sentences, then fold the newline-only
            // tail back into the paragraph's last sentence.
            var sentences = CutParagraphSentences(paragraph);
            result.AddRange(AfterProcessList(sentences));
        }
        return result;
    }
}
}