using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace AIProofread
{
/// <summary>
/// Helpers for cutting free text into paragraphs and sentences.
/// </summary>
public class StringUtil
{
    // Control character (U+0001) inserted at every detected sentence boundary;
    // the paragraph is split on it afterwards.
    private const char BoundaryMarker = '\u0001';

    // Replacement that re-emits both captured groups with the marker between them.
    private const string InsertMarker = "$1\u0001$2";

    // A sentence terminator (ideographic full stop, ASCII or full-width '!' / '?')
    // followed by anything except a closing curly quote.
    private static readonly Regex TerminatorBoundary =
        new Regex(@"([。!?\uFF01\uFF1F])([^”’])");

    // Six ASCII dots followed by anything except a closing curly quote.
    private static readonly Regex DotEllipsisBoundary =
        new Regex(@"(\.{6})([^”’])");

    // Two ellipsis characters followed by anything except a closing curly quote.
    private static readonly Regex EllipsisBoundary =
        new Regex(@"(\…{2})([^”’])");

    // A terminator plus closing curly quote, followed by anything that is not a
    // comma (ASCII or full-width) or another terminator.
    private static readonly Regex QuotedTerminatorBoundary =
        new Regex(@"([。!?\uFF01\uFF1F][”’])([^,\uFF0C。!?\uFF01\uFF1F])");

    /// <summary>
    /// Splits one paragraph into sentence fragments.
    /// </summary>
    /// <param name="para">Paragraph text to split.</param>
    /// <returns>
    /// The fragments in order. Concatenating them reproduces <paramref name="para"/>
    /// unless it already contained U+0001, which is consumed as a boundary.
    /// </returns>
    private static string[] CutParagraphSentences(string para)
    {
        para = TerminatorBoundary.Replace(para, InsertMarker);
        para = DotEllipsisBoundary.Replace(para, InsertMarker);
        para = EllipsisBoundary.Replace(para, InsertMarker);
        para = QuotedTerminatorBoundary.Replace(para, InsertMarker);
        return para.Split(new[] { BoundaryMarker }, StringSplitOptions.None);
    }

    /// <summary>
    /// Appends "\n" to every non-empty line and folds each empty line into the
    /// preceding entry as one more "\n". Empty lines before the first non-empty
    /// line are dropped.
    /// </summary>
    /// <param name="paragraphs">Lines produced by splitting the text on line breaks.</param>
    /// <returns>The non-empty lines, each ending in one or more "\n".</returns>
    private static List<string> PreProcessList(string[] paragraphs)
    {
        var list = new List<string>(paragraphs.Length);
        foreach (var text in paragraphs)
        {
            if (text.Length == 0)
            {
                if (list.Count > 0)
                {
                    list[list.Count - 1] += "\n";
                }
                continue;
            }
            list.Add(text + "\n");
        }
        return list;
    }

    /// <summary>
    /// Merges fragments that consist only of "\n" characters into the fragment
    /// before them, so line breaks stay attached to the sentence they follow.
    /// </summary>
    /// <param name="paragraphs">Sentence fragments of a single paragraph.</param>
    /// <returns>The fragments with newline-only entries merged backwards.</returns>
    private static List<string> AfterProcessList(string[] paragraphs)
    {
        var list = new List<string>(paragraphs.Length);
        foreach (var text in paragraphs)
        {
            // Append the whole run, not a single "\n", so no line break is lost.
            // A newline-only fragment with nothing before it is kept as its own entry.
            if (list.Count > 0 && IsNewlineRun(text))
            {
                list[list.Count - 1] += text;
                continue;
            }
            list.Add(text);
        }
        return list;
    }

    // True when the text is non-empty and made up solely of '\n' characters.
    private static bool IsNewlineRun(string text)
    {
        if (text.Length == 0)
        {
            return false;
        }
        foreach (var ch in text)
        {
            if (ch != '\n')
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Cuts text into sentences. Line breaks ("\r\n", "\r" or "\n") are
    /// normalised to "\n" and kept at the end of the sentence they follow.
    /// </summary>
    /// <param name="text">Text to split; may be null or empty.</param>
    /// <returns>
    /// The sentences in order. The last sentence of each paragraph ends in "\n",
    /// with one extra "\n" per empty line that followed it. Returns an empty
    /// list for null or empty input.
    /// </returns>
    public static List<string> CutTextToSentences(string text)
    {
        var result = new List<string>();
        if (string.IsNullOrEmpty(text))
        {
            return result;
        }

        // "\r\n" must come first: Split uses the first matching separator, so
        // listing "\r" ahead of it would turn every CRLF into two line breaks.
        var paragSplitor = new[] { "\r\n", "\r", "\n" };

        // Split into paragraphs first so each paragraph's line breaks can be
        // attached to its last sentence.
        var paragraphs = PreProcessList(text.Split(paragSplitor, StringSplitOptions.None));
        foreach (var paragraph in paragraphs)
        {
            var list = CutParagraphSentences(paragraph);
            result.AddRange(AfterProcessList(list));
        }
        return result;
    }
}
}