77 lines
2.6 KiB
C#
77 lines
2.6 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace AIProofread
|
|
{
|
|
public class StringUtil
{
    // Control character inserted where a sentence ends; the paragraph is split on it afterwards.
    private const char SentenceBreak = '\u0001';

    // Regex replacement template: keep both captured groups and put the break marker between them.
    private const string BreakBetweenGroups = "$1\u0001$2";

    private static readonly char[] SentenceBreakSeparators = { SentenceBreak };

    // "\r\n" must be listed before "\r" and "\n": String.Split uses the first array element that
    // matches at a position, so with "\r" first a Windows line ending would be consumed as two
    // separators and yield a phantom empty paragraph between them.
    private static readonly string[] ParagraphSeparators = { "\r\n", "\r", "\n" };

    // The patterns contain no ^ or $ anchors, so no RegexOptions are required.
    // Terminator (。 ! ?) followed by anything other than a closing quote.
    private static readonly Regex AfterTerminator = new Regex(@"([。!?\?])([^”’])");

    // Six ASCII dots followed by anything other than a closing quote.
    private static readonly Regex AfterSixDots = new Regex(@"(\.{6})([^”’])");

    // Two ellipsis characters (……) followed by anything other than a closing quote.
    private static readonly Regex AfterDoubleEllipsis = new Regex(@"(\…{2})([^”’])");

    // Terminator plus closing quote, followed by anything other than a comma or another terminator.
    private static readonly Regex AfterClosingQuote = new Regex(@"([。!?\?][”’])([^,。!?\?])");

    /// <summary>
    /// Splits one paragraph into sentences.
    /// </summary>
    /// <remarks>
    /// The four rules are applied one after another; each inserts a U+0001 marker between its
    /// two captured groups, and the paragraph is finally split on that marker. A terminator at
    /// the very end of the input has no following character and therefore produces no split.
    /// </remarks>
    /// <param name="para">Paragraph text to split.</param>
    /// <returns>The pieces of <paramref name="para"/> in order; never empty.</returns>
    private static string[] CutParagraphSentences(string para)
    {
        para = AfterTerminator.Replace(para, BreakBetweenGroups);
        para = AfterSixDots.Replace(para, BreakBetweenGroups);
        para = AfterDoubleEllipsis.Replace(para, BreakBetweenGroups);
        para = AfterClosingQuote.Replace(para, BreakBetweenGroups);
        return para.Split(SentenceBreakSeparators, StringSplitOptions.None);
    }

    /// <summary>
    /// Turns raw line segments into paragraphs that each end with "\n".
    /// </summary>
    /// <remarks>
    /// Every non-empty segment becomes one paragraph with "\n" appended. An empty segment adds
    /// one more "\n" to the previous paragraph; empty segments that come before the first
    /// paragraph are dropped.
    /// </remarks>
    /// <param name="paragraphs">Segments produced by splitting the text on line breaks.</param>
    /// <returns>The paragraphs, each terminated by at least one "\n".</returns>
    private static List<string> PreProcessList(string[] paragraphs)
    {
        List<string> list = new List<string>();
        foreach (var text in paragraphs)
        {
            if (text.Length > 0)
            {
                list.Add(text + "\n");
            }
            else if (list.Count > 0)
            {
                list[list.Count - 1] += "\n";
            }
        }
        return list;
    }

    /// <summary>
    /// Tells whether a piece is exactly "\n" or "\n\n".
    /// </summary>
    /// <remarks>
    /// This is the set of strings matched by the regex <c>^\n$</c> that this check replaces:
    /// without RegexOptions.Multiline, <c>$</c> matches at the end of the input and also just
    /// before a final newline.
    /// </remarks>
    private static bool IsBareLineBreak(string text)
    {
        return text == "\n" || text == "\n\n";
    }

    /// <summary>
    /// Merges a piece that is only a line break into the sentence before it.
    /// </summary>
    /// <param name="paragraphs">Pieces returned by <see cref="CutParagraphSentences"/>.</param>
    /// <returns>The sentences, with bare line breaks attached to the preceding sentence.</returns>
    private static List<string> AfterProcessList(string[] paragraphs)
    {
        List<string> list = new List<string>();
        foreach (var text in paragraphs)
        {
            // The Count check keeps a leading bare line break as its own entry instead of
            // indexing into an empty list.
            if (list.Count > 0 && IsBareLineBreak(text))
            {
                // NOTE(review): a "\n\n" piece contributes a single "\n" here, exactly as the
                // previous regex-based check did. Confirm whether the blank line is meant to be
                // dropped before changing this.
                list[list.Count - 1] += "\n";
                continue;
            }
            list.Add(text);
        }
        return list;
    }

    /// <summary>
    /// Splits text into sentences.
    /// </summary>
    /// <remarks>
    /// The text is first split into paragraphs on "\r\n", "\r" or "\n", then each paragraph is
    /// split into sentences. The last sentence of every paragraph ends with "\n", including the
    /// final paragraph when the input does not end with a line break.
    /// </remarks>
    /// <param name="text">Text to split; may be null or empty.</param>
    /// <returns>The sentences in order; an empty list when <paramref name="text"/> is null or empty.</returns>
    public static List<string> CutTextToSentences(string text)
    {
        List<string> result = new List<string>();
        if (string.IsNullOrEmpty(text))
        {
            return result;
        }

        // Split into paragraphs first so each line break can be attached to the last sentence
        // of its own paragraph.
        var paragraphs = PreProcessList(text.Split(ParagraphSeparators, StringSplitOptions.None));
        foreach (var paragraph in paragraphs)
        {
            result.AddRange(AfterProcessList(CutParagraphSentences(paragraph)));
        }
        return result;
    }
}
|
|
}
|