using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace AIProofread
{
    /// <summary>
    /// String helpers for cutting text into paragraphs and sentences.
    /// </summary>
    public class StringUtil
    {
        // Replacement used by every boundary rule: keeps both captured groups and
        // puts the control character U+0001 between them as a sentence-break marker.
        private const string BoundaryReplacement = "$1\u0001$2";

        // The marker character the paragraph is finally split on.
        private const char BoundaryMarker = '\u0001';

        /// <summary>
        /// Splits one paragraph into sentences.
        /// </summary>
        /// <param name="para">Paragraph text (the caller passes it with its trailing "\n").</param>
        /// <returns>The pieces of <paramref name="para"/> between sentence boundaries, in order.</returns>
        private static string[] CutParagraphSentences(string para)
        {
            // Every rule needs a character AFTER the punctuation, so punctuation that is
            // the very last character of the string never creates a boundary.
            // NOTE(review): the classes below contain ASCII '!', '?' and ',' — confirm
            // whether the full-width forms were intended as well.

            // Sentence-ending punctuation followed by anything but a closing quote.
            para = Regex.Replace(para, @"([。!?\?])([^”’])", BoundaryReplacement, RegexOptions.Multiline);
            // Six ASCII dots followed by anything but a closing quote.
            para = Regex.Replace(para, @"(\.{6})([^”’])", BoundaryReplacement, RegexOptions.Multiline);
            // Two ellipsis characters followed by anything but a closing quote.
            para = Regex.Replace(para, @"(\…{2})([^”’])", BoundaryReplacement, RegexOptions.Multiline);
            // Punctuation plus closing quote: break after the quote unless more punctuation follows.
            para = Regex.Replace(para, @"([。!?\?][”’])([^,。!?\?])", BoundaryReplacement, RegexOptions.Multiline);

            return para.Split(new char[] { BoundaryMarker }, StringSplitOptions.None);
        }

        /// <summary>
        /// Turns raw lines into paragraphs that carry their own line breaks.
        /// </summary>
        /// <param name="paragraphs">Lines produced by splitting the text on line-break characters.</param>
        /// <returns>
        /// One entry per non-empty line, each ending in "\n"; every empty line adds one more
        /// "\n" to the preceding entry. Empty lines before the first non-empty line are dropped.
        /// </returns>
        private static List<string> PreProcessList(string[] paragraphs)
        {
            List<string> list = new List<string>();
            foreach (var text in paragraphs)
            {
                if (text.Length == 0)
                {
                    if (list.Count > 0)
                    {
                        list[list.Count - 1] += "\n";
                    }
                    continue;
                }
                list.Add(text + "\n");
            }
            return list;
        }

        /// <summary>
        /// Returns true when <paramref name="text"/> is non-empty and consists only of '\n'.
        /// </summary>
        private static bool IsOnlyNewlines(string text)
        {
            if (text.Length == 0)
            {
                return false;
            }
            foreach (char c in text)
            {
                if (c != '\n')
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Folds newline-only pieces back into the sentence that precedes them.
        /// </summary>
        /// <param name="paragraphs">Sentence pieces of a single paragraph.</param>
        /// <returns>The pieces, with each newline-only piece appended to the previous entry.</returns>
        private static List<string> AfterProcessList(string[] paragraphs)
        {
            List<string> list = new List<string>();
            foreach (var text in paragraphs)
            {
                // Append the piece itself rather than a single "\n": the piece may hold
                // several line breaks and none of them should be lost. A newline-only
                // piece with no predecessor is kept as its own entry instead of indexing
                // into an empty list.
                if (list.Count > 0 && IsOnlyNewlines(text))
                {
                    list[list.Count - 1] += text;
                    continue;
                }
                list.Add(text);
            }
            return list;
        }

        /// <summary>
        /// Splits text into sentences.
        /// </summary>
        /// <param name="text">Text to split; null or empty yields an empty list.</param>
        /// <returns>
        /// The sentences in order. Line breaks are represented as "\n" and are attached to
        /// the last sentence of the paragraph they follow.
        /// </returns>
        public static List<string> CutTextToSentences(string text)
        {
            List<string> result = new List<string>();
            if (string.IsNullOrEmpty(text))
            {
                return result;
            }

            // NOTE(review): String.Split tries the separators in array order, so "\r\n" in
            // the input is presumably consumed as "\r" followed by "\n" (one extra empty
            // line, i.e. one "\n" per line-break character). Left unchanged because callers
            // may rely on the resulting lengths — confirm before reordering.
            var paragSplitor = new string[] { "\r", "\n", "\r\n" };

            // Split into paragraphs first so the line breaks can be attached to the
            // last sentence of each paragraph.
            var paragraphs = PreProcessList(text.Split(paragSplitor, StringSplitOptions.None));
            foreach (var paragraph in paragraphs)
            {
                // Cut the paragraph into sentences.
                var list = CutParagraphSentences(paragraph);
                // Attach the trailing line breaks to the paragraph's last sentence.
                result.AddRange(AfterProcessList(list));
            }
            return result;
        }
    }
}