from .Config import * def LimitStringLength(data, max_length:int=50) -> str: s:str = data if isinstance(data, str) else str(data) if len(s) <= max_length: return s else: inside_str = "\n...\n...\n" # 计算头尾部分的长度 head_length = max_length // 2 tail_length = max_length - head_length - len(inside_str) # 3 是省略号的长度 # 截取头尾部分并连接 return s[:head_length] + inside_str + s[-tail_length:] def FillString(data:Any, max_length: int = 50, fill_char: str = " ", side: Literal["left", "right", "center"] = "right" ) -> str: s:str = data if isinstance(data, str) else str(data) char = fill_char[0] if len(s) >= max_length: return s else: if side == "left": return s + char * (max_length - len(s)) elif side == "right": return char * (max_length - len(s)) + s elif side == "center": left = (max_length - len(s)) // 2 right = max_length - len(s) - left return char * left + s + char * right else: raise ValueError(f"Unsupported side: {side}") def Bytes2Strings(lines:List[bytes], encoding='utf-8') -> List[str]: return [line.decode(encoding) for line in lines] def Bytes2String(lines:List[bytes], encoding='utf-8') -> str: return "".join(Bytes2Strings(lines, encoding)) def word_segmentation( sentence, cut_all: bool = False, HMM: bool = True, use_paddle: bool = False ) -> Sequence[Optional[Union[Any, str]]]: try: import jieba return jieba.dt.cut(str(sentence), cut_all=cut_all, HMM=HMM, use_paddle=use_paddle) except ImportError: raise ValueError("jieba is not install") def GetEditorDistanceAndOperations( s1:str, s2:str, ) -> Tuple[int, List[Tuple[Literal["add","delete"], int, int, str]]]: """ 计算两个字符串的编辑距离和操作序列 操作格式: (操作类型, 开始位置, 结束位置, 内容) 位置基于源字符串s1 """ m, n = len(s1), len(s2) # 使用简单的LCS算法来找到最长公共子序列 # 然后基于LCS生成操作序列 lcs = [[0] * (n + 1) for _ in range(m + 1)] # 构建LCS表 for i in range(1, m + 1): for j in range(1, n + 1): if s1[i - 1] == s2[j - 1]: lcs[i][j] = lcs[i - 1][j - 1] + 1 else: lcs[i][j] = max(lcs[i - 1][j], lcs[i][j - 1]) # 基于LCS生成操作序列 operations = [] i, j = m, n while i > 0 or j > 0: if i > 0 and j > 0 and s1[i - 1] == s2[j - 1]: # 字符匹配,不需要操作 i -= 1 j -= 1 elif j > 0 and (i == 0 or lcs[i][j - 1] >= lcs[i - 1][j]): # 需要插入s2[j-1] # 找到插入位置(在s1中的位置) insert_pos = i operations.insert(0, ("add", insert_pos, insert_pos, s2[j - 1])) j -= 1 else: # 需要删除s1[i-1] operations.insert(0, ("delete", i - 1, i, s1[i - 1])) i -= 1 # 合并连续的操作 merged_operations = [] for op in operations: if merged_operations and merged_operations[-1][0] == op[0]: last_op = merged_operations[-1] if op[0] == "add" and last_op[2] == op[1]: # 合并连续的添加操作 merged_operations[-1] = (op[0], last_op[1], op[2], last_op[3] + op[3]) elif op[0] == "delete" and last_op[2] == op[1]: # 合并连续的删除操作 merged_operations[-1] = (op[0], last_op[1], op[2], last_op[3] + op[3]) else: merged_operations.append(op) else: merged_operations.append(op) # 计算编辑距离 edit_distance = m + n - 2 * lcs[m][n] return edit_distance, merged_operations def _build_line_lcs(lines1: List[str], lines2: List[str]) -> List[List[int]]: """ 构建行级LCS动态规划表 """ m, n = len(lines1), len(lines2) lcs = [[0] * (n + 1) for _ in range(m + 1)] # 使用哈希加速行比较 hash1 = [hash(line) for line in lines1] hash2 = [hash(line) for line in lines2] for i in range(1, m + 1): for j in range(1, n + 1): if hash1[i-1] == hash2[j-1] and lines1[i-1] == lines2[j-1]: lcs[i][j] = lcs[i-1][j-1] + 1 else: lcs[i][j] = max(lcs[i-1][j], lcs[i][j-1]) return lcs def _extract_line_operations(lines1: List[str], lines2: List[str], lcs: List[List[int]]) -> List[Tuple[str, int, int, List[str]]]: """ 从LCS表提取行级操作序列 返回: (操作类型, 起始行号, 结束行号, 行内容列表) """ operations = [] m, n = len(lines1), len(lines2) i, j = m, n while i > 0 or j > 0: if i > 0 and j > 0 and lines1[i-1] == lines2[j-1]: i -= 1 j -= 1 elif j > 0 and (i == 0 or lcs[i][j-1] >= lcs[i-1][j]): operations.insert(0, ("add", i, i, [lines2[j-1]])) j -= 1 else: operations.insert(0, ("delete", i-1, i, [lines1[i-1]])) i -= 1 # 合并连续的同类行操作 merged = [] for op_type, start, end, lines in operations: if merged and merged[-1][0] == op_type and merged[-1][2] == start: merged[-1] = (op_type, merged[-1][1], end, merged[-1][3] + lines) else: merged.append((op_type, start, end, lines)) return merged def _char_diff_in_region(s1: str, s2: str) -> List[Tuple[str, int, int, str]]: """ 对小范围区域进行字符级LCS比较 返回相对于输入字符串的位置 """ m, n = len(s1), len(s2) # 快速路径 if m == 0 and n == 0: return [] if m == 0: return [("add", 0, 0, s2)] if n == 0: return [("delete", 0, m, s1)] if s1 == s2: return [] # 字符级LCS lcs = [[0] * (n + 1) for _ in range(m + 1)] for i in range(1, m + 1): for j in range(1, n + 1): if s1[i-1] == s2[j-1]: lcs[i][j] = lcs[i-1][j-1] + 1 else: lcs[i][j] = max(lcs[i-1][j], lcs[i][j-1]) # 回溯生成操作 operations = [] i, j = m, n while i > 0 or j > 0: if i > 0 and j > 0 and s1[i-1] == s2[j-1]: i -= 1 j -= 1 elif j > 0 and (i == 0 or lcs[i][j-1] >= lcs[i-1][j]): operations.insert(0, ("add", i, i, s2[j-1])) j -= 1 else: operations.insert(0, ("delete", i-1, i, s1[i-1])) i -= 1 # 合并连续操作 merged = [] for op_type, start, end, content in operations: if merged and merged[-1][0] == op_type: last_op = merged[-1] if op_type == "add" and last_op[2] == start: merged[-1] = (op_type, last_op[1], end, last_op[3] + content) elif op_type == "delete" and last_op[2] == start: merged[-1] = (op_type, last_op[1], end, last_op[3] + content) else: merged.append((op_type, start, end, content)) else: merged.append((op_type, start, end, content)) return merged def GetDiffOperations( s1:str, s2:str, ) -> List[Tuple[Literal["add","delete"], int, int, str]]: """ 计算两个字符串的差异操作序列(混合行级+字符级算法) 操作格式: (操作类型, 开始位置, 结束位置, 内容) 位置基于源字符串s1的字符偏移 """ # 快速路径 if s1 == s2: return [] if not s1: return [("add", 0, 0, s2)] if not s2: return [("delete", 0, len(s1), s1)] # 阶段1: 分行并建立位置映射 lines1 = s1.split('\n') lines2 = s2.split('\n') # 构建行号到字符位置的映射 line_offsets_s1 = [0] for line in lines1[:-1]: line_offsets_s1.append(line_offsets_s1[-1] + len(line) + 1) # +1 for '\n' line_offsets_s2 = [0] for line in lines2[:-1]: line_offsets_s2.append(line_offsets_s2[-1] + len(line) + 1) # 阶段2: 行级LCS分析 lcs = _build_line_lcs(lines1, lines2) line_operations = _extract_line_operations(lines1, lines2, lcs) # 阶段3: 转换为字符级操作 final_operations = [] for op_type, start_line, end_line, op_lines in line_operations: if op_type == "add": # 添加操作: 在s1的start_line位置插入 char_pos = line_offsets_s1[start_line] if start_line < len(line_offsets_s1) else len(s1) content = '\n'.join(op_lines) # 对于添加的行块,可以选择字符级细化或直接使用 # 这里先直接使用行级结果 final_operations.append(("add", char_pos, char_pos, content)) elif op_type == "delete": # 删除操作: 删除s1的[start_line, end_line)行 char_start = line_offsets_s1[start_line] if end_line < len(lines1): char_end = line_offsets_s1[end_line] else: char_end = len(s1) content = '\n'.join(op_lines) final_operations.append(("delete", char_start, char_end, content)) # 阶段4: 对于连续的删除+添加,尝试字符级精细比较 optimized_operations = [] i = 0 while i < len(final_operations): if (i + 1 < len(final_operations) and final_operations[i][0] == "delete" and final_operations[i+1][0] == "add" and final_operations[i][2] == final_operations[i+1][1]): # 这是一个修改操作,进行字符级细化 del_op = final_operations[i] add_op = final_operations[i+1] old_text = del_op[3] new_text = add_op[3] base_pos = del_op[1] # 字符级比较 char_ops = _char_diff_in_region(old_text, new_text) # 调整位置到全局坐标 for op_type, rel_start, rel_end, content in char_ops: optimized_operations.append((op_type, base_pos + rel_start, base_pos + rel_end, content)) i += 2 else: optimized_operations.append(final_operations[i]) i += 1 return optimized_operations