While doing supervised instruction fine-tuning (SFT) on Qwen3, one of the generated datasets turned out to contain large amounts of repeated characters, words, phrases, and sentence patterns.
So I had an AI write a deduplication script:
Repetition shows up in several forms:

- Character-level: 啊啊啊啊, 数据数据数据
- Word-level: 重要重要重要, 测试测试测试
- Short-sentence-level: 这是一个测试。这是一个测试。这是一个测试。
- Sentence-pattern-level: 重要的是..., 首先..., 步骤一...

No single level of detection covers every case, so the script combines several detection methods, as the sketch below illustrates.
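For instance, the character-level case alone can be caught with a simple backreference regex, but that same regex misses word- and sentence-level repetition, which is why the script stacks multiple detectors. A minimal sketch (inputs are hypothetical):

```python
import re

# A bare character-level check: flags a run of one character, but misses a
# repeated two-character word, let alone repeated sentences or patterns.
print(bool(re.search(r'(.)\1{3,}', "啊啊啊啊正常文本")))  # True: run of 啊
print(bool(re.search(r'(.)\1{3,}', "重要重要重要")))      # False: no single-char run
```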
Parameters (these drive the n-gram detector; the other detectors have analogous knobs):
- min_length: minimum n-gram length
- max_length: maximum n-gram length
- min_repeats: minimum number of times an n-gram must occur
- min_ratio: threshold for the share of total content covered by repeated n-grams
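To make min_ratio concrete: with min_length=6, min_repeats=3 and min_ratio=0.08, a 6-character phrase occurring 3 times in a 150-character text contributes 6 × (3 − 1) = 12 repeated characters, a ratio of 12 / 150 = 0.08, just enough to flag the text (this mirrors the n * (count - 1) / total_length computation in has_high_ngram_repetition below).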
The full script:
```python
import json
import re
from collections import Counter
import time
import sys

def has_high_char_repetition(text, min_repeats=4, max_ratio=0.1):
    """
    Detect runs of a single repeated character (e.g. "啊啊啊").
    """
    if not text:
        return False

    # Extract all Chinese characters
    chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
    if not chinese_chars:
        return False

    total_chars = len(chinese_chars)

    # Scan for consecutive repeated characters
    repeat_count = 0
    i = 0
    while i < len(chinese_chars) - min_repeats + 1:
        current_char = chinese_chars[i]
        repeat_len = 1

        # Measure the length of the run starting at i
        while i + repeat_len < len(chinese_chars) and chinese_chars[i + repeat_len] == current_char:
            repeat_len += 1

        # Count the run if it reaches the threshold
        if repeat_len >= min_repeats:
            repeat_count += repeat_len
            i += repeat_len
        else:
            i += 1

    return repeat_count / total_chars >= max_ratio
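
# Sanity check (hypothetical inputs): a run of four identical characters
# that makes up at least 10% of the Chinese text is flagged.
#   has_high_char_repetition("啊啊啊啊")       -> True  (run of 4, ratio 1.0)
#   has_high_char_repetition("正常的中文叙述")  -> False (no runs)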

def has_high_symbol_letter_repetition(text, min_repeats=8, max_ratio=0.05):
    """
    Detect repeated symbols and letters (case-sensitive).

    Design notes:
    - Handles letters (case-sensitive) and symbols separately from Chinese text
    - Uses a higher repeat threshold to avoid flagging normal content
    - Checks both consecutive runs and periodic patterns

    Parameters:
    - min_repeats: minimum repeat count (raised to 8 to avoid false positives)
    - max_ratio: threshold share of repeated content (lowered to 5%)

    Returns:
    - bool: whether the text contains highly repetitive symbols/letters
    """
    if not text:
        return False

    # Extract all non-Chinese characters (letters, digits, symbols)
    non_chinese = re.findall(r'[^\u4e00-\u9fff]', text)
    if not non_chinese or len(non_chinese) < min_repeats * 2:
        return False

    total_non_chinese = len(non_chinese)

    # Check 1: consecutive runs of the same character (e.g. "aaaaaaaa")
    consecutive_repeats = 0
    i = 0
    while i < len(non_chinese) - min_repeats + 1:
        current_char = non_chinese[i]
        repeat_len = 1

        while i + repeat_len < len(non_chinese) and non_chinese[i + repeat_len] == current_char:
            repeat_len += 1

        if repeat_len >= min_repeats:
            consecutive_repeats += repeat_len
            i += repeat_len
        else:
            i += 1

    if consecutive_repeats / total_non_chinese >= max_ratio:
        return True

    # Check 2: periodic repetition (e.g. "abababab")
    # Only period lengths 2-5 are checked
    for pattern_len in [2, 3, 4, 5]:
        if total_non_chinese < pattern_len * min_repeats:
            continue

        i = 0
        periodic_repeats = 0

        while i <= len(non_chinese) - pattern_len * min_repeats:
            pattern = non_chinese[i:i+pattern_len]
            repeat_len = 1

            # Extend the periodic run
            while (i + pattern_len * (repeat_len + 1) <= len(non_chinese) and
                   non_chinese[i + pattern_len * repeat_len:i + pattern_len * (repeat_len + 1)] == pattern):
                repeat_len += 1

            # Count the run if it reaches the threshold
            if repeat_len >= min_repeats:
                # Count repeated characters (excluding the first occurrence)
                periodic_repeats += pattern_len * (repeat_len - 1)
                i += pattern_len * repeat_len
            else:
                i += 1

        if periodic_repeats / total_non_chinese >= max_ratio:
            return True

    return False
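
# Sanity check (hypothetical inputs): at least min_repeats * 2 = 16 non-Chinese
# characters are required before anything is flagged.
#   has_high_symbol_letter_repetition("a" * 16)  -> True (consecutive run)
#   has_high_symbol_letter_repetition("ab" * 8)  -> True (period-2 pattern)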

def has_high_word_repetition(text, min_word_length=2, max_word_length=10,
                             min_repeats=2, max_ratio=0.08):
    """
    Detect consecutively repeated words (optimized version).
    """
    if not text:
        return False

    # Concatenate all Chinese characters
    chinese_text = ''.join(re.findall(r'[\u4e00-\u9fff]', text))
    if len(chinese_text) < min_word_length * min_repeats:
        return False

    total_length = len(chinese_text)
    if total_length == 0:
        return False

    # Optimization: only check specific word lengths (2, 4, 6, 8, 10 characters)
    word_lengths = [2, 4, 6, 8, 10]
    max_repeat_ratio = 0

    for word_len in word_lengths:
        if word_len > max_word_length or word_len > total_length // min_repeats:
            continue

        i = 0
        repeat_chars = 0

        while i <= len(chinese_text) - word_len * min_repeats:
            word = chinese_text[i:i+word_len]
            repeat_len = 1

            # Extend the run of back-to-back copies of the candidate word
            while (i + word_len * (repeat_len + 1) <= len(chinese_text) and
                   chinese_text[i + word_len * repeat_len:i + word_len * (repeat_len + 1)] == word):
                repeat_len += 1

            # Count the run if it reaches the threshold
            if repeat_len >= min_repeats:
                repeat_chars += word_len * (repeat_len - 1)
                i += word_len * repeat_len
            else:
                i += 1

        # Repetition ratio for this word length
        if repeat_chars > 0:
            repeat_ratio = repeat_chars / total_length
            max_repeat_ratio = max(max_repeat_ratio, repeat_ratio)

    return max_repeat_ratio >= max_ratio
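
# Sanity check (hypothetical input): a 2-character word repeated back to back
# is flagged once the repeated share reaches max_ratio (8% by default).
#   has_high_word_repetition("重要重要")  -> True (2 of 4 chars are repeats, 50%)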

def has_high_short_sentence_repetition(text, min_length=5, min_repeats=5, min_ratio=0.1):
    """
    Detect repeated short sentences (optimized version).
    """
    if not text or len(text) < min_length * min_repeats:
        return False

    # Split into sentences on Chinese punctuation
    sentences = re.split(r'[。!?;…\n]', text)
    # Drop sentences that are too short
    sentences = [s.strip() for s in sentences if len(s.strip()) >= min_length]

    if len(sentences) < min_repeats:
        return False

    # Fast path: extremely high-density repetition
    sentence_counts = Counter(sentences)
    for sentence, count in sentence_counts.most_common(1):
        if count >= 8:  # 8 occurrences of any one sentence is enough to flag
            return True

    # Otherwise measure the repeated share
    total_length = sum(len(s) for s in sentences)
    repeated_length = 0

    for sentence, count in sentence_counts.items():
        if count >= min_repeats:
            repeated_length += len(sentence) * (count - 1)

    return repeated_length / total_length >= min_ratio
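
# Sanity check (hypothetical input): the same sentence eight times trips the
# fast path regardless of min_ratio.
#   has_high_short_sentence_repetition("这是一个测试。" * 8)  -> True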

def has_high_sentence_pattern_repetition(text,
                                         min_repeated_patterns=4,  # raised to 4 to avoid false positives
                                         pattern_threshold=0.3,    # raised to 30% to avoid false positives
                                         pattern_length=4,
                                         max_common_patterns=1):   # note: currently unused
    """
    Detect repeated sentence patterns (optimized version with reduced sensitivity).

    Key adjustments:
    - min_repeated_patterns raised from 2 to 4
    - pattern_threshold raised from 15% to 30%
    - Avoids flagging normal narrative prose as repetition
    """
    if not text or len(text) < 50:
        return False

    # Split into sentences
    sentences = re.split(r'[。!?;…\n]', text)
    # Filter out invalid sentences
    sentences = [
        s.strip()
        for s in sentences
        if len(s.strip()) > 8 and
        not re.match(r'^[\d#*●\-]\.?[\s\*]*', s) and
        len(s.strip()) < 100  # drop very long sentences (likely code)
    ]

    if len(sentences) < 8:  # require at least 8 sentences (avoids misjudging short texts)
        return False

    # Extract sentence-opening patterns (skipping common openers)
    patterns = []
    common_starts = {"首先", "其次", "然后", "最后", "另外", "不过", "但是", "因此", "所以",
                     "例如", "比如", "关于", "对于", "通过", "基于", "根据", "由于"}

    for s in sentences:
        clean_s = re.sub(r'^[^\u4e00-\u9fff]*', '', s)
        if len(clean_s) >= pattern_length:
            pattern = clean_s[:pattern_length]
            # Skip common openers
            if pattern not in common_starts:
                patterns.append(pattern)

    if not patterns or len(patterns) < 5:
        return False

    # Count high-frequency openings
    pattern_counts = Counter(patterns)
    total = len(patterns)

    # Flag only when the most common opening is both frequent and dominant
    for pattern, count in pattern_counts.most_common(1):
        # Must satisfy both:
        # 1. repeated at least min_repeated_patterns times
        # 2. accounts for at least pattern_threshold of all openings
        # (common openers were already filtered out above)
        if count >= min_repeated_patterns and count / total >= pattern_threshold:
            return True

    return False
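
# Sanity check (hypothetical input): eight sentences opening with the same four
# characters -> top opening count 8 >= 4 and share 100% >= 30%.
#   has_high_sentence_pattern_repetition("数据清洗很重要因为质量决定效果。" * 8)  -> True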

def has_high_ngram_repetition(text, min_length=6, max_length=15, min_repeats=3, min_ratio=0.08):
    """
    High-performance n-gram repetition detection.
    """
    if not text or len(text) < min_length * min_repeats:
        return False

    # Keep only the Chinese characters
    chinese_text = ''.join(re.findall(r'[\u4e00-\u9fff]', text))
    if len(chinese_text) < min_length * min_repeats:
        return False

    total_length = len(chinese_text)
    if total_length < 20:
        return False

    # Optimization: sample long texts instead of scanning everything
    sample_points = []
    if total_length <= 500:
        sample_points = [(0, total_length)]
    else:
        sample_size = min(300, total_length // 2)
        sample_points = [
            (0, sample_size),
            (total_length // 2, sample_size),
            (total_length - sample_size, sample_size)
        ]

    # Only check specific n-gram lengths
    ngram_lengths = [6, 9, 12, 15]

    for start, size in sample_points:
        end = min(start + size, total_length)
        if end <= start:
            continue

        sample_text = chinese_text[start:end]

        for n in ngram_lengths:
            if n > len(sample_text) // min_repeats:
                continue

            ngram_counts = {}
            max_count = 0

            for i in range(len(sample_text) - n + 1):
                ngram = sample_text[i:i+n]
                ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1
                if ngram_counts[ngram] > max_count:
                    max_count = ngram_counts[ngram]

            # Early exit: 8+ occurrences flags the text immediately
            if max_count >= 8:
                return True

            # Otherwise check the repeated share against min_ratio
            # (note: counts come from the sample but the ratio uses the full
            # length, which makes this check conservative on long texts)
            for count in ngram_counts.values():
                if count >= min_repeats:
                    repeat_ratio = n * (count - 1) / total_length
                    if repeat_ratio >= min_ratio:
                        return True

    return False
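
# Sanity check (hypothetical input): a 6-character phrase tiled five times.
#   has_high_ngram_repetition("深度学习模型" * 5)  -> True (5 hits, ratio 0.8)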

def is_highly_repetitive(text,
                         char_params=(4, 0.1),
                         symbol_letter_params=(8, 0.05),  # dedicated to symbols/letters
                         word_params=(2, 10, 2, 0.08),
                         short_sentence_params=(5, 5, 0.1),
                         pattern_params=(4, 0.3, 4, 1),   # reduced pattern sensitivity
                         ngram_params=(6, 15, 3, 0.08)):
    """
    Combined check: is the text highly repetitive?

    Improvements:
    1. Dedicated symbol/letter repetition detection
    2. Reduced sensitivity for sentence-pattern detection
    3. Detection order tuned for early exits
    """
    # 1. Symbol/letter repetition (targets the newly observed failure mode)
    if has_high_symbol_letter_repetition(text, *symbol_letter_params):
        return True, "symbol/letter repetition"

    # 2. Fast check: very high-density short-sentence repetition
    if has_high_short_sentence_repetition(text, *short_sentence_params):
        return True, "short-sentence repetition"

    # 3. Consecutive character runs
    if has_high_char_repetition(text, *char_params):
        return True, "consecutive character repetition"

    # 4. Consecutive word repetition
    if has_high_word_repetition(text, *word_params):
        return True, "consecutive word repetition"

    # 5. Sentence-pattern repetition (with reduced sensitivity)
    if has_high_sentence_pattern_repetition(text, *pattern_params):
        return True, "sentence-pattern repetition"

    # 6. n-gram repetition
    if has_high_ngram_repetition(text, *ngram_params):
        return True, "n-gram repetition"

    return False, "no significant repetition"
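
# Return contract (hypothetical inputs):
#   is_highly_repetitive("这是一个测试。" * 10)
#       -> (True, "short-sentence repetition")
#   is_highly_repetitive("一段正常的中文叙述,没有明显的重复内容。")
#       -> (False, "no significant repetition")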

def clean_json_file(input_path, output_path, batch_size=1000):
    """
    Clean a JSON file of highly repetitive entries.
    """
    # Detect file format (JSON Lines vs. a single JSON array)
    with open(input_path, 'r', encoding='utf-8') as f:
        first_char = f.read(1)
        f.seek(0)
        is_json_lines = (first_char != '[')

    # Load the data
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        if is_json_lines:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError:
                        continue
        else:
            data = json.load(f)

    # Process the data
    original_count = len(data)
    cleaned_data = []
    removed_items = []
    removal_reasons = Counter()
    start_time = time.time()

    print(f"Processing {original_count} entries...")

    # Process each entry
    for i, item in enumerate(data):
        # Progress report
        if i % batch_size == 0 and i > 0:
            elapsed = time.time() - start_time
            items_per_sec = i / elapsed
            remaining = (original_count - i) / items_per_sec
            print(f"  Progress: {i}/{original_count} ({i/original_count:.1%}), "
                  f"estimated time remaining: {remaining:.1f}s")

        # Concatenate all text fields
        text_fields = [
            item.get('instruction', ''),
            item.get('input', ''),
            item.get('output', '')
        ]
        full_text = "\n".join(text_fields)

        is_repetitive, reason = is_highly_repetitive(full_text)
        if not is_repetitive:
            cleaned_data.append(item)
        else:
            removed_items.append(item)
            removal_reasons[reason] += 1

    # Write the kept entries
    with open(output_path, 'w', encoding='utf-8') as f:
        if is_json_lines:
            # fixed: an earlier draft wrote `cleaned_` instead of `cleaned_data`
            for item in cleaned_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
        else:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

    # Write the removed entries alongside the output
    if removed_items:
        removed_path = output_path.replace('.json', '_removed.json')
        with open(removed_path, 'w', encoding='utf-8') as f:
            if is_json_lines:
                for item in removed_items:
                    f.write(json.dumps(item, ensure_ascii=False) + '\n')
            else:
                json.dump(removed_items, f, ensure_ascii=False, indent=2)

    # Print statistics
    total_time = time.time() - start_time
    print(f"\nCleaning complete! Original entries: {original_count}")
    print(f"Kept: {len(cleaned_data)} ({len(cleaned_data)/original_count:.1%})")
    print(f"Removed: {len(removed_items)} ({len(removed_items)/original_count:.1%})")
    print(f"Throughput: {original_count/total_time:.1f} entries/s")

    if removed_items:
        print("\nRemoval reasons:")
        for reason, count in removal_reasons.most_common():
            print(f"  - {reason}: {count} ({count/len(removed_items):.1%})")

        # Show one example
        print("\nSample analysis of removed content:")
        sample = removed_items[0]
        text_fields = [
            sample.get('instruction', ''),
            sample.get('input', ''),
            sample.get('output', '')
        ]
        full_text = "\n".join(text_fields)

        is_repetitive, reason = is_highly_repetitive(full_text)
        print(f"Detected: {reason}")

        # Show a concrete snippet based on the repetition type
        if reason == "symbol/letter repetition":
            # Look for a consecutive run
            consecutive = re.search(r'([^\u4e00-\u9fff])\1{7,}', full_text)
            if consecutive:
                char = consecutive.group(1)
                print(f"  Consecutive symbol/letter run: '{char * 8}'")

            # Otherwise look for a periodic pattern
            if not consecutive:
                for pattern_len in [2, 3, 4]:
                    pattern = r'([^\u4e00-\u9fff]{' + str(pattern_len) + r'})\1{3,}'
                    periodic = re.search(pattern, full_text)
                    if periodic:
                        print(f"  Periodic repetition: '{periodic.group(1) * 4}'")
                        break

        elif reason == "consecutive character repetition":
            repeats = re.findall(r'([\u4e00-\u9fff])\1{3,}', full_text)
            if repeats:
                print(f"  Character run: '{repeats[0] * 4}'")

        elif reason == "consecutive word repetition":
            for word_len in range(2, 11):
                pattern = r'([\u4e00-\u9fff]{' + str(word_len) + r'})\1{1,}'
                matches = re.findall(pattern, full_text)
                if matches:
                    print(f"  Word repetition: '{matches[0] * 2}'")
                    break

        elif reason in ["short-sentence repetition", "n-gram repetition"]:
            chinese_text = ''.join(re.findall(r'[\u4e00-\u9fff]', full_text))
            if len(chinese_text) > 20:
                sample_text = chinese_text[:min(200, len(chinese_text))]
                best_ngram = ""
                best_count = 0

                for n in range(5, 16):
                    ngrams = [sample_text[i:i+n] for i in range(len(sample_text) - n + 1)]
                    if ngrams:
                        counter = Counter(ngrams)
                        most_common, count = counter.most_common(1)[0]
                        if count > best_count:
                            best_count = count
                            best_ngram = most_common

                if best_count >= 3:
                    print(f"  High-frequency fragment: '{best_ngram}' repeated {best_count} times")

        elif reason == "sentence-pattern repetition":
            sentences = re.split(r'[。!?;…\n]', full_text)
            sentences = [s.strip() for s in sentences if len(s.strip()) > 8]
            patterns = []
            for s in sentences:
                clean_s = re.sub(r'^[^\u4e00-\u9fff]*', '', s)
                if len(clean_s) >= 4:
                    patterns.append(clean_s[:4])
            pattern_counts = Counter(patterns)
            for pattern, count in pattern_counts.most_common(1):
                if count >= 4:  # matches the detection threshold
                    print(f"  Sentence pattern: '{pattern}...' repeated {count} times")

    return len(removed_items)

if __name__ == "__main__":
    # Parameters tuned for this dataset's characteristics
    # (note: these mirror the defaults of is_highly_repetitive; clean_json_file
    # does not currently accept them as arguments)
    CHAR_PARAMS = (4, 0.1)
    SYMBOL_LETTER_PARAMS = (8, 0.05)   # dedicated to symbol/letter repetition
    WORD_PARAMS = (2, 10, 2, 0.08)
    SHORT_SENTENCE_PARAMS = (5, 5, 0.1)
    PATTERN_PARAMS = (4, 0.3, 4, 1)    # reduced sentence-pattern sensitivity
    NGRAM_PARAMS = (6, 15, 3, 0.08)

    print("Starting high-performance repetition detection tool...")
    start_time = time.time()

    removed_count = clean_json_file(
        input_path='toxic-sft-zh-plus.json',
        output_path='toxic-sft-zh-plus-new.json'
    )

    total_time = time.time() - start_time
    print(f"\nDone! Total time: {total_time:.2f}s")
```
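One loose end: the script imports sys but never uses it. If you want the input and output paths configurable from the command line, a minimal sketch (hypothetical, not part of the original script) could replace the hardcoded call:

```python
import sys

# Hypothetical CLI wiring: take paths from argv, falling back to the
# original hardcoded filenames.
if __name__ == "__main__":
    input_path = sys.argv[1] if len(sys.argv) > 1 else 'toxic-sft-zh-plus.json'
    output_path = sys.argv[2] if len(sys.argv) > 2 else 'toxic-sft-zh-plus-new.json'
    clean_json_file(input_path=input_path, output_path=output_path)
```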
The results:
```
(venv) D:\AI\data_tools>python quchong.py
Starting high-performance repetition detection tool...
Processing 56481 entries...
  Progress: 1000/56481 (1.8%), estimated time remaining: 28.2s
  Progress: 2000/56481 (3.5%), estimated time remaining: 30.5s
  Progress: 3000/56481 (5.3%), estimated time remaining: 30.2s
  Progress: 4000/56481 (7.1%), estimated time remaining: 30.2s
  Progress: 5000/56481 (8.9%), estimated time remaining: 30.2s
  Progress: 6000/56481 (10.6%), estimated time remaining: 29.4s
  Progress: 7000/56481 (12.4%), estimated time remaining: 28.7s
  Progress: 8000/56481 (14.2%), estimated time remaining: 28.1s
  Progress: 9000/56481 (15.9%), estimated time remaining: 27.6s
  Progress: 10000/56481 (17.7%), estimated time remaining: 27.0s
  Progress: 11000/56481 (19.5%), estimated time remaining: 26.4s
  Progress: 12000/56481 (21.2%), estimated time remaining: 25.8s
  Progress: 13000/56481 (23.0%), estimated time remaining: 25.1s
  Progress: 14000/56481 (24.8%), estimated time remaining: 24.5s
  Progress: 15000/56481 (26.6%), estimated time remaining: 23.9s
  Progress: 16000/56481 (28.3%), estimated time remaining: 23.3s
  Progress: 17000/56481 (30.1%), estimated time remaining: 22.8s
  Progress: 18000/56481 (31.9%), estimated time remaining: 22.2s
  Progress: 19000/56481 (33.6%), estimated time remaining: 21.6s
  Progress: 20000/56481 (35.4%), estimated time remaining: 21.0s
  Progress: 21000/56481 (37.2%), estimated time remaining: 20.4s
  Progress: 22000/56481 (39.0%), estimated time remaining: 19.9s
  Progress: 23000/56481 (40.7%), estimated time remaining: 19.3s
  Progress: 24000/56481 (42.5%), estimated time remaining: 18.8s
  Progress: 25000/56481 (44.3%), estimated time remaining: 18.2s
  Progress: 26000/56481 (46.0%), estimated time remaining: 17.6s
  Progress: 27000/56481 (47.8%), estimated time remaining: 17.1s
  Progress: 28000/56481 (49.6%), estimated time remaining: 16.5s
  Progress: 29000/56481 (51.3%), estimated time remaining: 16.0s
  Progress: 30000/56481 (53.1%), estimated time remaining: 15.4s
  Progress: 31000/56481 (54.9%), estimated time remaining: 14.8s
  Progress: 32000/56481 (56.7%), estimated time remaining: 14.2s
  Progress: 33000/56481 (58.4%), estimated time remaining: 13.7s
  Progress: 34000/56481 (60.2%), estimated time remaining: 13.1s
  Progress: 35000/56481 (62.0%), estimated time remaining: 12.5s
  Progress: 36000/56481 (63.7%), estimated time remaining: 11.9s
  Progress: 37000/56481 (65.5%), estimated time remaining: 11.3s
  Progress: 38000/56481 (67.3%), estimated time remaining: 10.8s
  Progress: 39000/56481 (69.0%), estimated time remaining: 10.2s
  Progress: 40000/56481 (70.8%), estimated time remaining: 9.6s
  Progress: 41000/56481 (72.6%), estimated time remaining: 9.0s
  Progress: 42000/56481 (74.4%), estimated time remaining: 8.4s
  Progress: 43000/56481 (76.1%), estimated time remaining: 7.8s
  Progress: 44000/56481 (77.9%), estimated time remaining: 7.2s
  Progress: 45000/56481 (79.7%), estimated time remaining: 6.6s
  Progress: 46000/56481 (81.4%), estimated time remaining: 6.0s
  Progress: 47000/56481 (83.2%), estimated time remaining: 5.4s
  Progress: 48000/56481 (85.0%), estimated time remaining: 4.9s
  Progress: 49000/56481 (86.8%), estimated time remaining: 4.3s
  Progress: 50000/56481 (88.5%), estimated time remaining: 3.7s
  Progress: 51000/56481 (90.3%), estimated time remaining: 3.1s
  Progress: 52000/56481 (92.1%), estimated time remaining: 2.5s
  Progress: 53000/56481 (93.8%), estimated time remaining: 2.0s
  Progress: 54000/56481 (95.6%), estimated time remaining: 1.4s
  Progress: 55000/56481 (97.4%), estimated time remaining: 0.8s
  Progress: 56000/56481 (99.1%), estimated time remaining: 0.3s

Cleaning complete! Original entries: 56481
Kept: 31454 (55.7%)
Removed: 25027 (44.3%)
Throughput: 1749.2 entries/s

Removal reasons:
  - sentence-pattern repetition: 23969 (95.8%)
  - n-gram repetition: 540 (2.2%)
  - short-sentence repetition: 446 (1.8%)
  - consecutive word repetition: 65 (0.3%)
  - consecutive character repetition: 7 (0.0%)

Sample analysis of removed content:
Detected: sentence-pattern repetition
  Sentence pattern: '萨德-马...' repeated 5 times

Done! Total time: 32.58s
```