While doing supervised instruction fine-tuning (SFT) on Qwen3, one of the generated datasets turned out to contain large amounts of repeated characters, words, phrases, and sentence patterns.
So I had an AI write a deduplication script:
Repetition shows up in several forms:

- Character-level: 啊啊啊啊, 数据数据数据
- Word-level: 重要重要重要, 测试测试测试
- Short-sentence-level: 这是一个测试。这是一个测试。这是一个测试。
- Sentence-pattern-level: 重要的是..., 首先..., 步骤一...

No single level of detection covers every case, so the script combines several detection methods, as the sketch below illustrates.
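For instance, the character-level case alone can be caught with a simple backreference regex, but that same regex misses word- and sentence-level repetition, which is why the script stacks multiple detectors. A minimal sketch (inputs are hypothetical):

```python
import re

# A bare character-level check: flags a run of one character, but misses a
# repeated two-character word, let alone repeated sentences or patterns.
print(bool(re.search(r'(.)\1{3,}', "啊啊啊啊正常文本")))  # True: run of 啊
print(bool(re.search(r'(.)\1{3,}', "重要重要重要")))      # False: no single-char run
```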
Parameters (these drive the n-gram detector; the other detectors have analogous knobs):
- min_length: minimum n-gram length
- max_length: maximum n-gram length
- min_repeats: minimum number of times an n-gram must occur
- min_ratio: threshold for the share of total content covered by repeated n-grams
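To make min_ratio concrete: with min_length=6, min_repeats=3 and min_ratio=0.08, a 6-character phrase occurring 3 times in a 150-character text contributes 6 × (3 − 1) = 12 repeated characters, a ratio of 12 / 150 = 0.08, just enough to flag the text (this mirrors the n * (count - 1) / total_length computation in has_high_ngram_repetition below).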
The full script:
```python
import json
import re
from collections import Counter
import time
import sys

def has_high_char_repetition(text, min_repeats=4, max_ratio=0.1):
    """
    Detect runs of a single repeated character (e.g. "啊啊啊").
    """
    if not text:
        return False

    # Extract all Chinese characters
    chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
    if not chinese_chars:
        return False

    total_chars = len(chinese_chars)

    # Scan for consecutive repeated characters
    repeat_count = 0
    i = 0
    while i < len(chinese_chars) - min_repeats + 1:
        current_char = chinese_chars[i]
        repeat_len = 1

        # Measure the length of the run starting at i
        while i + repeat_len < len(chinese_chars) and chinese_chars[i + repeat_len] == current_char:
            repeat_len += 1

        # Count the run if it reaches the threshold
        if repeat_len >= min_repeats:
            repeat_count += repeat_len
            i += repeat_len
        else:
            i += 1

    return repeat_count / total_chars >= max_ratio
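
# Sanity check (hypothetical inputs): a run of four identical characters
# that makes up at least 10% of the Chinese text is flagged.
#   has_high_char_repetition("啊啊啊啊")       -> True  (run of 4, ratio 1.0)
#   has_high_char_repetition("正常的中文叙述")  -> False (no runs)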

def has_high_symbol_letter_repetition(text, min_repeats=8, max_ratio=0.05):
    """
    Detect repeated symbols and letters (case-sensitive).

    Design notes:
    - Handles letters (case-sensitive) and symbols separately from Chinese text
    - Uses a higher repeat threshold to avoid flagging normal content
    - Checks both consecutive runs and periodic patterns

    Parameters:
    - min_repeats: minimum repeat count (raised to 8 to avoid false positives)
    - max_ratio: threshold share of repeated content (lowered to 5%)

    Returns:
    - bool: whether the text contains highly repetitive symbols/letters
    """
    if not text:
        return False

    # Extract all non-Chinese characters (letters, digits, symbols)
    non_chinese = re.findall(r'[^\u4e00-\u9fff]', text)
    if not non_chinese or len(non_chinese) < min_repeats * 2:
        return False

    total_non_chinese = len(non_chinese)

    # Check 1: consecutive runs of the same character (e.g. "aaaaaaaa")
    consecutive_repeats = 0
    i = 0
    while i < len(non_chinese) - min_repeats + 1:
        current_char = non_chinese[i]
        repeat_len = 1

        while i + repeat_len < len(non_chinese) and non_chinese[i + repeat_len] == current_char:
            repeat_len += 1

        if repeat_len >= min_repeats:
            consecutive_repeats += repeat_len
            i += repeat_len
        else:
            i += 1

    if consecutive_repeats / total_non_chinese >= max_ratio:
        return True

    # Check 2: periodic repetition (e.g. "abababab")
    # Only period lengths 2-5 are checked
    for pattern_len in [2, 3, 4, 5]:
        if total_non_chinese < pattern_len * min_repeats:
            continue

        i = 0
        periodic_repeats = 0

        while i <= len(non_chinese) - pattern_len * min_repeats:
            pattern = non_chinese[i:i+pattern_len]
            repeat_len = 1

            # Extend the periodic run
            while (i + pattern_len * (repeat_len + 1) <= len(non_chinese) and
                   non_chinese[i + pattern_len * repeat_len:i + pattern_len * (repeat_len + 1)] == pattern):
                repeat_len += 1

            # Count the run if it reaches the threshold
            if repeat_len >= min_repeats:
                # Count repeated characters (excluding the first occurrence)
                periodic_repeats += pattern_len * (repeat_len - 1)
                i += pattern_len * repeat_len
            else:
                i += 1

        if periodic_repeats / total_non_chinese >= max_ratio:
            return True

    return False
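
# Sanity check (hypothetical inputs): at least min_repeats * 2 = 16 non-Chinese
# characters are required before anything is flagged.
#   has_high_symbol_letter_repetition("a" * 16)  -> True (consecutive run)
#   has_high_symbol_letter_repetition("ab" * 8)  -> True (period-2 pattern)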

def has_high_word_repetition(text, min_word_length=2, max_word_length=10,
                             min_repeats=2, max_ratio=0.08):
    """
    Detect consecutively repeated words (optimized version).
    """
    if not text:
        return False

    # Concatenate all Chinese characters
    chinese_text = ''.join(re.findall(r'[\u4e00-\u9fff]', text))
    if len(chinese_text) < min_word_length * min_repeats:
        return False

    total_length = len(chinese_text)
    if total_length == 0:
        return False

    # Optimization: only check specific word lengths (2, 4, 6, 8, 10 characters)
    word_lengths = [2, 4, 6, 8, 10]
    max_repeat_ratio = 0

    for word_len in word_lengths:
        if word_len > max_word_length or word_len > total_length // min_repeats:
            continue

        i = 0
        repeat_chars = 0

        while i <= len(chinese_text) - word_len * min_repeats:
            word = chinese_text[i:i+word_len]
            repeat_len = 1

            # Extend the run of back-to-back copies of the candidate word
            while (i + word_len * (repeat_len + 1) <= len(chinese_text) and
                   chinese_text[i + word_len * repeat_len:i + word_len * (repeat_len + 1)] == word):
                repeat_len += 1

            # Count the run if it reaches the threshold
            if repeat_len >= min_repeats:
                repeat_chars += word_len * (repeat_len - 1)
                i += word_len * repeat_len
            else:
                i += 1

        # Repetition ratio for this word length
        if repeat_chars > 0:
            repeat_ratio = repeat_chars / total_length
            max_repeat_ratio = max(max_repeat_ratio, repeat_ratio)

    return max_repeat_ratio >= max_ratio
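
# Sanity check (hypothetical input): a 2-character word repeated back to back
# is flagged once the repeated share reaches max_ratio (8% by default).
#   has_high_word_repetition("重要重要")  -> True (2 of 4 chars are repeats, 50%)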

def has_high_short_sentence_repetition(text, min_length=5, min_repeats=5, min_ratio=0.1):
    """
    Detect repeated short sentences (optimized version).
    """
    if not text or len(text) < min_length * min_repeats:
        return False

    # Split into sentences on Chinese punctuation
    sentences = re.split(r'[。!?;…\n]', text)
    # Drop sentences that are too short
    sentences = [s.strip() for s in sentences if len(s.strip()) >= min_length]

    if len(sentences) < min_repeats:
        return False

    # Fast path: extremely high-density repetition
    sentence_counts = Counter(sentences)
    for sentence, count in sentence_counts.most_common(1):
        if count >= 8:  # 8 occurrences of any one sentence is enough to flag
            return True

    # Otherwise measure the repeated share
    total_length = sum(len(s) for s in sentences)
    repeated_length = 0

    for sentence, count in sentence_counts.items():
        if count >= min_repeats:
            repeated_length += len(sentence) * (count - 1)

    return repeated_length / total_length >= min_ratio
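
# Sanity check (hypothetical input): the same sentence eight times trips the
# fast path regardless of min_ratio.
#   has_high_short_sentence_repetition("这是一个测试。" * 8)  -> True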

def has_high_sentence_pattern_repetition(text,
                                         min_repeated_patterns=4,  # raised to 4 to avoid false positives
                                         pattern_threshold=0.3,    # raised to 30% to avoid false positives
                                         pattern_length=4,
                                         max_common_patterns=1):   # note: currently unused
    """
    Detect repeated sentence patterns (optimized version with reduced sensitivity).

    Key adjustments:
    - min_repeated_patterns raised from 2 to 4
    - pattern_threshold raised from 15% to 30%
    - Avoids flagging normal narrative prose as repetition
    """
    if not text or len(text) < 50:
        return False

    # Split into sentences
    sentences = re.split(r'[。!?;…\n]', text)
    # Filter out invalid sentences
    sentences = [
        s.strip()
        for s in sentences
        if len(s.strip()) > 8 and
        not re.match(r'^[\d#*●\-]\.?[\s\*]*', s) and
        len(s.strip()) < 100  # drop very long sentences (likely code)
    ]

    if len(sentences) < 8:  # require at least 8 sentences (avoids misjudging short texts)
        return False

    # Extract sentence-opening patterns (skipping common openers)
    patterns = []
    common_starts = {"首先", "其次", "然后", "最后", "另外", "不过", "但是", "因此", "所以",
                     "例如", "比如", "关于", "对于", "通过", "基于", "根据", "由于"}

    for s in sentences:
        clean_s = re.sub(r'^[^\u4e00-\u9fff]*', '', s)
        if len(clean_s) >= pattern_length:
            pattern = clean_s[:pattern_length]
            # Skip common openers
            if pattern not in common_starts:
                patterns.append(pattern)

    if not patterns or len(patterns) < 5:
        return False

    # Count high-frequency openings
    pattern_counts = Counter(patterns)
    total = len(patterns)

    # Flag only when the most common opening is both frequent and dominant
    for pattern, count in pattern_counts.most_common(1):
        # Must satisfy both:
        # 1. repeated at least min_repeated_patterns times
        # 2. accounts for at least pattern_threshold of all openings
        # (common openers were already filtered out above)
        if count >= min_repeated_patterns and count / total >= pattern_threshold:
            return True

    return False
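
# Sanity check (hypothetical input): eight sentences opening with the same four
# characters -> top opening count 8 >= 4 and share 100% >= 30%.
#   has_high_sentence_pattern_repetition("数据清洗很重要因为质量决定效果。" * 8)  -> True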

def has_high_ngram_repetition(text, min_length=6, max_length=15, min_repeats=3, min_ratio=0.08):
    """
    High-performance n-gram repetition detection.
    """
    if not text or len(text) < min_length * min_repeats:
        return False

    # Keep only the Chinese characters
    chinese_text = ''.join(re.findall(r'[\u4e00-\u9fff]', text))
    if len(chinese_text) < min_length * min_repeats:
        return False

    total_length = len(chinese_text)
    if total_length < 20:
        return False

    # Optimization: sample long texts instead of scanning everything
    sample_points = []
    if total_length <= 500:
        sample_points = [(0, total_length)]
    else:
        sample_size = min(300, total_length // 2)
        sample_points = [
            (0, sample_size),
            (total_length // 2, sample_size),
            (total_length - sample_size, sample_size)
        ]

    # Only check specific n-gram lengths
    ngram_lengths = [6, 9, 12, 15]

    for start, size in sample_points:
        end = min(start + size, total_length)
        if end <= start:
            continue

        sample_text = chinese_text[start:end]

        for n in ngram_lengths:
            if n > len(sample_text) // min_repeats:
                continue

            ngram_counts = {}
            max_count = 0

            for i in range(len(sample_text) - n + 1):
                ngram = sample_text[i:i+n]
                ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1
                if ngram_counts[ngram] > max_count:
                    max_count = ngram_counts[ngram]

            # Early exit: 8+ occurrences flags the text immediately
            if max_count >= 8:
                return True

            # Otherwise check the repeated share against min_ratio
            # (note: counts come from the sample but the ratio uses the full
            # length, which makes this check conservative on long texts)
            for count in ngram_counts.values():
                if count >= min_repeats:
                    repeat_ratio = n * (count - 1) / total_length
                    if repeat_ratio >= min_ratio:
                        return True

    return False
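
# Sanity check (hypothetical input): a 6-character phrase tiled five times.
#   has_high_ngram_repetition("深度学习模型" * 5)  -> True (5 hits, ratio 0.8)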

def is_highly_repetitive(text,
                         char_params=(4, 0.1),
                         symbol_letter_params=(8, 0.05),  # dedicated to symbols/letters
                         word_params=(2, 10, 2, 0.08),
                         short_sentence_params=(5, 5, 0.1),
                         pattern_params=(4, 0.3, 4, 1),   # reduced pattern sensitivity
                         ngram_params=(6, 15, 3, 0.08)):
    """
    Combined check: is the text highly repetitive?

    Improvements:
    1. Dedicated symbol/letter repetition detection
    2. Reduced sensitivity for sentence-pattern detection
    3. Detection order tuned for early exits
    """
    # 1. Symbol/letter repetition (targets the newly observed failure mode)
    if has_high_symbol_letter_repetition(text, *symbol_letter_params):
        return True, "symbol/letter repetition"

    # 2. Fast check: very high-density short-sentence repetition
    if has_high_short_sentence_repetition(text, *short_sentence_params):
        return True, "short-sentence repetition"

    # 3. Consecutive character runs
    if has_high_char_repetition(text, *char_params):
        return True, "consecutive character repetition"

    # 4. Consecutive word repetition
    if has_high_word_repetition(text, *word_params):
        return True, "consecutive word repetition"

    # 5. Sentence-pattern repetition (with reduced sensitivity)
    if has_high_sentence_pattern_repetition(text, *pattern_params):
        return True, "sentence-pattern repetition"

    # 6. n-gram repetition
    if has_high_ngram_repetition(text, *ngram_params):
        return True, "n-gram repetition"

    return False, "no significant repetition"
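
# Return contract (hypothetical inputs):
#   is_highly_repetitive("这是一个测试。" * 10)
#       -> (True, "short-sentence repetition")
#   is_highly_repetitive("一段正常的中文叙述,没有明显的重复内容。")
#       -> (False, "no significant repetition")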

def clean_json_file(input_path, output_path, batch_size=1000):
    """
    Clean a JSON file of highly repetitive entries.
    """
    # Detect file format (JSON Lines vs. a single JSON array)
    with open(input_path, 'r', encoding='utf-8') as f:
        first_char = f.read(1)
        f.seek(0)
        is_json_lines = (first_char != '[')

    # Load the data
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        if is_json_lines:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError:
                        continue
        else:
            data = json.load(f)

    # Process the data
    original_count = len(data)
    cleaned_data = []
    removed_items = []
    removal_reasons = Counter()
    start_time = time.time()

    print(f"Processing {original_count} entries...")

    # Process each entry
    for i, item in enumerate(data):
        # Progress report
        if i % batch_size == 0 and i > 0:
            elapsed = time.time() - start_time
            items_per_sec = i / elapsed
            remaining = (original_count - i) / items_per_sec
            print(f"  Progress: {i}/{original_count} ({i/original_count:.1%}), "
                  f"estimated time remaining: {remaining:.1f}s")

        # Concatenate all text fields
        text_fields = [
            item.get('instruction', ''),
            item.get('input', ''),
            item.get('output', '')
        ]
        full_text = "\n".join(text_fields)

        is_repetitive, reason = is_highly_repetitive(full_text)
        if not is_repetitive:
            cleaned_data.append(item)
        else:
            removed_items.append(item)
            removal_reasons[reason] += 1

    # Write the kept entries
    with open(output_path, 'w', encoding='utf-8') as f:
        if is_json_lines:
            # fixed: an earlier draft wrote `cleaned_` instead of `cleaned_data`
            for item in cleaned_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
        else:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

    # Write the removed entries alongside the output
    if removed_items:
        removed_path = output_path.replace('.json', '_removed.json')
        with open(removed_path, 'w', encoding='utf-8') as f:
            if is_json_lines:
                for item in removed_items:
                    f.write(json.dumps(item, ensure_ascii=False) + '\n')
            else:
                json.dump(removed_items, f, ensure_ascii=False, indent=2)

    # Print statistics
    total_time = time.time() - start_time
    print(f"\nCleaning complete! Original entries: {original_count}")
    print(f"Kept: {len(cleaned_data)} ({len(cleaned_data)/original_count:.1%})")
    print(f"Removed: {len(removed_items)} ({len(removed_items)/original_count:.1%})")
    print(f"Throughput: {original_count/total_time:.1f} entries/s")

    if removed_items:
        print("\nRemoval reasons:")
        for reason, count in removal_reasons.most_common():
            print(f"  - {reason}: {count} ({count/len(removed_items):.1%})")

        # Show one example
        print("\nSample analysis of removed content:")
        sample = removed_items[0]
        text_fields = [
            sample.get('instruction', ''),
            sample.get('input', ''),
            sample.get('output', '')
        ]
        full_text = "\n".join(text_fields)

        is_repetitive, reason = is_highly_repetitive(full_text)
        print(f"Detected: {reason}")

        # Show a concrete snippet based on the repetition type
        if reason == "symbol/letter repetition":
            # Look for a consecutive run
            consecutive = re.search(r'([^\u4e00-\u9fff])\1{7,}', full_text)
            if consecutive:
                char = consecutive.group(1)
                print(f"  Consecutive symbol/letter run: '{char * 8}'")

            # Otherwise look for a periodic pattern
            if not consecutive:
                for pattern_len in [2, 3, 4]:
                    pattern = r'([^\u4e00-\u9fff]{' + str(pattern_len) + r'})\1{3,}'
                    periodic = re.search(pattern, full_text)
                    if periodic:
                        print(f"  Periodic repetition: '{periodic.group(1) * 4}'")
                        break

        elif reason == "consecutive character repetition":
            repeats = re.findall(r'([\u4e00-\u9fff])\1{3,}', full_text)
            if repeats:
                print(f"  Character run: '{repeats[0] * 4}'")

        elif reason == "consecutive word repetition":
            for word_len in range(2, 11):
                pattern = r'([\u4e00-\u9fff]{' + str(word_len) + r'})\1{1,}'
                matches = re.findall(pattern, full_text)
                if matches:
                    print(f"  Word repetition: '{matches[0] * 2}'")
                    break

        elif reason in ["short-sentence repetition", "n-gram repetition"]:
            chinese_text = ''.join(re.findall(r'[\u4e00-\u9fff]', full_text))
            if len(chinese_text) > 20:
                sample_text = chinese_text[:min(200, len(chinese_text))]
                best_ngram = ""
                best_count = 0

                for n in range(5, 16):
                    ngrams = [sample_text[i:i+n] for i in range(len(sample_text) - n + 1)]
                    if ngrams:
                        counter = Counter(ngrams)
                        most_common, count = counter.most_common(1)[0]
                        if count > best_count:
                            best_count = count
                            best_ngram = most_common

                if best_count >= 3:
                    print(f"  High-frequency fragment: '{best_ngram}' repeated {best_count} times")

        elif reason == "sentence-pattern repetition":
            sentences = re.split(r'[。!?;…\n]', full_text)
            sentences = [s.strip() for s in sentences if len(s.strip()) > 8]
            patterns = []
            for s in sentences:
                clean_s = re.sub(r'^[^\u4e00-\u9fff]*', '', s)
                if len(clean_s) >= 4:
                    patterns.append(clean_s[:4])
            pattern_counts = Counter(patterns)
            for pattern, count in pattern_counts.most_common(1):
                if count >= 4:  # matches the detection threshold
                    print(f"  Sentence pattern: '{pattern}...' repeated {count} times")

    return len(removed_items)

if __name__ == "__main__":
    # Parameters tuned for this dataset's characteristics
    # (note: these mirror the defaults of is_highly_repetitive; clean_json_file
    # does not currently accept them as arguments)
    CHAR_PARAMS = (4, 0.1)
    SYMBOL_LETTER_PARAMS = (8, 0.05)   # dedicated to symbol/letter repetition
    WORD_PARAMS = (2, 10, 2, 0.08)
    SHORT_SENTENCE_PARAMS = (5, 5, 0.1)
    PATTERN_PARAMS = (4, 0.3, 4, 1)    # reduced sentence-pattern sensitivity
    NGRAM_PARAMS = (6, 15, 3, 0.08)

    print("Starting high-performance repetition detection tool...")
    start_time = time.time()

    removed_count = clean_json_file(
        input_path='toxic-sft-zh-plus.json',
        output_path='toxic-sft-zh-plus-new.json'
    )

    total_time = time.time() - start_time
    print(f"\nDone! Total time: {total_time:.2f}s")
```
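One loose end: the script imports sys but never uses it. If you want the input and output paths configurable from the command line, a minimal sketch (hypothetical, not part of the original script) could replace the hardcoded call:

```python
import sys

# Hypothetical CLI wiring: take paths from argv, falling back to the
# original hardcoded filenames.
if __name__ == "__main__":
    input_path = sys.argv[1] if len(sys.argv) > 1 else 'toxic-sft-zh-plus.json'
    output_path = sys.argv[2] if len(sys.argv) > 2 else 'toxic-sft-zh-plus-new.json'
    clean_json_file(input_path=input_path, output_path=output_path)
```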
The results:
```
(venv) D:\AI\data_tools>python quchong.py
Starting high-performance repetition detection tool...
Processing 56481 entries...
  Progress: 1000/56481 (1.8%), estimated time remaining: 28.2s
  Progress: 2000/56481 (3.5%), estimated time remaining: 30.5s
  Progress: 3000/56481 (5.3%), estimated time remaining: 30.2s
  Progress: 4000/56481 (7.1%), estimated time remaining: 30.2s
  Progress: 5000/56481 (8.9%), estimated time remaining: 30.2s
  Progress: 6000/56481 (10.6%), estimated time remaining: 29.4s
  Progress: 7000/56481 (12.4%), estimated time remaining: 28.7s
  Progress: 8000/56481 (14.2%), estimated time remaining: 28.1s
  Progress: 9000/56481 (15.9%), estimated time remaining: 27.6s
  Progress: 10000/56481 (17.7%), estimated time remaining: 27.0s
  Progress: 11000/56481 (19.5%), estimated time remaining: 26.4s
  Progress: 12000/56481 (21.2%), estimated time remaining: 25.8s
  Progress: 13000/56481 (23.0%), estimated time remaining: 25.1s
  Progress: 14000/56481 (24.8%), estimated time remaining: 24.5s
  Progress: 15000/56481 (26.6%), estimated time remaining: 23.9s
  Progress: 16000/56481 (28.3%), estimated time remaining: 23.3s
  Progress: 17000/56481 (30.1%), estimated time remaining: 22.8s
  Progress: 18000/56481 (31.9%), estimated time remaining: 22.2s
  Progress: 19000/56481 (33.6%), estimated time remaining: 21.6s
  Progress: 20000/56481 (35.4%), estimated time remaining: 21.0s
  Progress: 21000/56481 (37.2%), estimated time remaining: 20.4s
  Progress: 22000/56481 (39.0%), estimated time remaining: 19.9s
  Progress: 23000/56481 (40.7%), estimated time remaining: 19.3s
  Progress: 24000/56481 (42.5%), estimated time remaining: 18.8s
  Progress: 25000/56481 (44.3%), estimated time remaining: 18.2s
  Progress: 26000/56481 (46.0%), estimated time remaining: 17.6s
  Progress: 27000/56481 (47.8%), estimated time remaining: 17.1s
  Progress: 28000/56481 (49.6%), estimated time remaining: 16.5s
  Progress: 29000/56481 (51.3%), estimated time remaining: 16.0s
  Progress: 30000/56481 (53.1%), estimated time remaining: 15.4s
  Progress: 31000/56481 (54.9%), estimated time remaining: 14.8s
  Progress: 32000/56481 (56.7%), estimated time remaining: 14.2s
  Progress: 33000/56481 (58.4%), estimated time remaining: 13.7s
  Progress: 34000/56481 (60.2%), estimated time remaining: 13.1s
  Progress: 35000/56481 (62.0%), estimated time remaining: 12.5s
  Progress: 36000/56481 (63.7%), estimated time remaining: 11.9s
  Progress: 37000/56481 (65.5%), estimated time remaining: 11.3s
  Progress: 38000/56481 (67.3%), estimated time remaining: 10.8s
  Progress: 39000/56481 (69.0%), estimated time remaining: 10.2s
  Progress: 40000/56481 (70.8%), estimated time remaining: 9.6s
  Progress: 41000/56481 (72.6%), estimated time remaining: 9.0s
  Progress: 42000/56481 (74.4%), estimated time remaining: 8.4s
  Progress: 43000/56481 (76.1%), estimated time remaining: 7.8s
  Progress: 44000/56481 (77.9%), estimated time remaining: 7.2s
  Progress: 45000/56481 (79.7%), estimated time remaining: 6.6s
  Progress: 46000/56481 (81.4%), estimated time remaining: 6.0s
  Progress: 47000/56481 (83.2%), estimated time remaining: 5.4s
  Progress: 48000/56481 (85.0%), estimated time remaining: 4.9s
  Progress: 49000/56481 (86.8%), estimated time remaining: 4.3s
  Progress: 50000/56481 (88.5%), estimated time remaining: 3.7s
  Progress: 51000/56481 (90.3%), estimated time remaining: 3.1s
  Progress: 52000/56481 (92.1%), estimated time remaining: 2.5s
  Progress: 53000/56481 (93.8%), estimated time remaining: 2.0s
  Progress: 54000/56481 (95.6%), estimated time remaining: 1.4s
  Progress: 55000/56481 (97.4%), estimated time remaining: 0.8s
  Progress: 56000/56481 (99.1%), estimated time remaining: 0.3s

Cleaning complete! Original entries: 56481
Kept: 31454 (55.7%)
Removed: 25027 (44.3%)
Throughput: 1749.2 entries/s

Removal reasons:
  - sentence-pattern repetition: 23969 (95.8%)
  - n-gram repetition: 540 (2.2%)
  - short-sentence repetition: 446 (1.8%)
  - consecutive word repetition: 65 (0.3%)
  - consecutive character repetition: 7 (0.0%)

Sample analysis of removed content:
Detected: sentence-pattern repetition
  Sentence pattern: '萨德-马...' repeated 5 times

Done! Total time: 32.58s
```