新增一个经过优化的支持大文本获取编辑操作的函数
This commit is contained in:
@@ -115,14 +115,199 @@ def GetEditorDistanceAndOperations(
|
|||||||
edit_distance = m + n - 2 * lcs[m][n]
|
edit_distance = m + n - 2 * lcs[m][n]
|
||||||
return edit_distance, merged_operations
|
return edit_distance, merged_operations
|
||||||
|
|
||||||
|
def _build_line_lcs(lines1: List[str], lines2: List[str]) -> List[List[int]]:
|
||||||
|
"""
|
||||||
|
构建行级LCS动态规划表
|
||||||
|
"""
|
||||||
|
m, n = len(lines1), len(lines2)
|
||||||
|
lcs = [[0] * (n + 1) for _ in range(m + 1)]
|
||||||
|
|
||||||
|
# 使用哈希加速行比较
|
||||||
|
hash1 = [hash(line) for line in lines1]
|
||||||
|
hash2 = [hash(line) for line in lines2]
|
||||||
|
|
||||||
|
for i in range(1, m + 1):
|
||||||
|
for j in range(1, n + 1):
|
||||||
|
if hash1[i-1] == hash2[j-1] and lines1[i-1] == lines2[j-1]:
|
||||||
|
lcs[i][j] = lcs[i-1][j-1] + 1
|
||||||
|
else:
|
||||||
|
lcs[i][j] = max(lcs[i-1][j], lcs[i][j-1])
|
||||||
|
|
||||||
|
return lcs
|
||||||
|
|
||||||
|
def _extract_line_operations(lines1: List[str], lines2: List[str], lcs: List[List[int]]) -> List[Tuple[str, int, int, List[str]]]:
|
||||||
|
"""
|
||||||
|
从LCS表提取行级操作序列
|
||||||
|
返回: (操作类型, 起始行号, 结束行号, 行内容列表)
|
||||||
|
"""
|
||||||
|
operations = []
|
||||||
|
m, n = len(lines1), len(lines2)
|
||||||
|
i, j = m, n
|
||||||
|
|
||||||
|
while i > 0 or j > 0:
|
||||||
|
if i > 0 and j > 0 and lines1[i-1] == lines2[j-1]:
|
||||||
|
i -= 1
|
||||||
|
j -= 1
|
||||||
|
elif j > 0 and (i == 0 or lcs[i][j-1] >= lcs[i-1][j]):
|
||||||
|
operations.insert(0, ("add", i, i, [lines2[j-1]]))
|
||||||
|
j -= 1
|
||||||
|
else:
|
||||||
|
operations.insert(0, ("delete", i-1, i, [lines1[i-1]]))
|
||||||
|
i -= 1
|
||||||
|
|
||||||
|
# 合并连续的同类行操作
|
||||||
|
merged = []
|
||||||
|
for op_type, start, end, lines in operations:
|
||||||
|
if merged and merged[-1][0] == op_type and merged[-1][2] == start:
|
||||||
|
merged[-1] = (op_type, merged[-1][1], end, merged[-1][3] + lines)
|
||||||
|
else:
|
||||||
|
merged.append((op_type, start, end, lines))
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
|
def _char_diff_in_region(s1: str, s2: str) -> List[Tuple[str, int, int, str]]:
|
||||||
|
"""
|
||||||
|
对小范围区域进行字符级LCS比较
|
||||||
|
返回相对于输入字符串的位置
|
||||||
|
"""
|
||||||
|
m, n = len(s1), len(s2)
|
||||||
|
|
||||||
|
# 快速路径
|
||||||
|
if m == 0 and n == 0:
|
||||||
|
return []
|
||||||
|
if m == 0:
|
||||||
|
return [("add", 0, 0, s2)]
|
||||||
|
if n == 0:
|
||||||
|
return [("delete", 0, m, s1)]
|
||||||
|
if s1 == s2:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# 字符级LCS
|
||||||
|
lcs = [[0] * (n + 1) for _ in range(m + 1)]
|
||||||
|
|
||||||
|
for i in range(1, m + 1):
|
||||||
|
for j in range(1, n + 1):
|
||||||
|
if s1[i-1] == s2[j-1]:
|
||||||
|
lcs[i][j] = lcs[i-1][j-1] + 1
|
||||||
|
else:
|
||||||
|
lcs[i][j] = max(lcs[i-1][j], lcs[i][j-1])
|
||||||
|
|
||||||
|
# 回溯生成操作
|
||||||
|
operations = []
|
||||||
|
i, j = m, n
|
||||||
|
|
||||||
|
while i > 0 or j > 0:
|
||||||
|
if i > 0 and j > 0 and s1[i-1] == s2[j-1]:
|
||||||
|
i -= 1
|
||||||
|
j -= 1
|
||||||
|
elif j > 0 and (i == 0 or lcs[i][j-1] >= lcs[i-1][j]):
|
||||||
|
operations.insert(0, ("add", i, i, s2[j-1]))
|
||||||
|
j -= 1
|
||||||
|
else:
|
||||||
|
operations.insert(0, ("delete", i-1, i, s1[i-1]))
|
||||||
|
i -= 1
|
||||||
|
|
||||||
|
# 合并连续操作
|
||||||
|
merged = []
|
||||||
|
for op_type, start, end, content in operations:
|
||||||
|
if merged and merged[-1][0] == op_type:
|
||||||
|
last_op = merged[-1]
|
||||||
|
if op_type == "add" and last_op[2] == start:
|
||||||
|
merged[-1] = (op_type, last_op[1], end, last_op[3] + content)
|
||||||
|
elif op_type == "delete" and last_op[2] == start:
|
||||||
|
merged[-1] = (op_type, last_op[1], end, last_op[3] + content)
|
||||||
|
else:
|
||||||
|
merged.append((op_type, start, end, content))
|
||||||
|
else:
|
||||||
|
merged.append((op_type, start, end, content))
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
def _split_lines_keepends(text: str) -> List[str]:
    """Split ``text`` on '\\n', keeping each '\\n' attached to its line."""
    parts = text.split('\n')
    lines = [part + '\n' for part in parts[:-1]]
    if parts[-1]:
        # Final line without a trailing newline.
        lines.append(parts[-1])
    return lines


def GetDiffOperations(
    s1:str,
    s2:str,
) -> List[Tuple[Literal["add","delete"], int, int, str]]:
    """
    Compute the diff operations between two strings (hybrid line-level +
    character-level algorithm).

    Operation format: (op_type, start, end, content)

    * ``"add"``: ``start == end`` is the character offset in ``s1`` at which
      ``content`` is inserted.
    * ``"delete"``: ``content`` is ``s1[start:end]``.

    All positions are character offsets into the original ``s1``.

    Args:
        s1: Source string.
        s2: Target string.

    Returns:
        The list of operations in ascending position order; empty when the
        strings are equal.
    """
    # Fast paths
    if s1 == s2:
        return []
    if not s1:
        return [("add", 0, 0, s2)]
    if not s2:
        return [("delete", 0, len(s1), s1)]

    # Phase 1: split into lines. Every line keeps its trailing '\n', so that
    # joining a run of lines reproduces the exact text it spans; dropping the
    # terminators would lose the newline after the last line of each run.
    lines1 = _split_lines_keepends(s1)
    lines2 = _split_lines_keepends(s2)

    # line_offsets_s1[k] is the offset in s1 where line k starts; the final
    # entry equals len(s1), so "one past the last line" is a valid index.
    line_offsets_s1 = [0]
    for line in lines1:
        line_offsets_s1.append(line_offsets_s1[-1] + len(line))

    # Phase 2: line-level LCS analysis
    lcs = _build_line_lcs(lines1, lines2)
    line_operations = _extract_line_operations(lines1, lines2, lcs)

    # Phase 3: convert line operations into character-offset operations
    final_operations = []
    for op_type, start_line, end_line, op_lines in line_operations:
        content = ''.join(op_lines)
        if op_type == "add":
            # Insert before source line start_line.
            char_pos = line_offsets_s1[start_line]
            final_operations.append(("add", char_pos, char_pos, content))
        elif op_type == "delete":
            # Remove source lines [start_line, end_line).
            char_start = line_offsets_s1[start_line]
            char_end = line_offsets_s1[end_line]
            final_operations.append(("delete", char_start, char_end, content))

    # Phase 4: a delete immediately followed by an add at the position where
    # the delete ends is a modification; refine it with a character-level diff.
    optimized_operations = []
    i = 0
    total = len(final_operations)
    while i < total:
        del_op = final_operations[i]
        add_op = final_operations[i + 1] if i + 1 < total else None
        if (add_op is not None and
                del_op[0] == "delete" and
                add_op[0] == "add" and
                del_op[2] == add_op[1]):
            base_pos = del_op[1]
            # del_op[3] is exactly s1[base_pos:del_op[2]], so the relative
            # positions returned here translate to s1 by adding base_pos.
            for op_type, rel_start, rel_end, content in _char_diff_in_region(del_op[3], add_op[3]):
                optimized_operations.append((op_type, base_pos + rel_start, base_pos + rel_end, content))
            i += 2
        else:
            optimized_operations.append(del_op)
            i += 1

    return optimized_operations
|
||||||
Reference in New Issue
Block a user