1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
|
#!/usr/bin/env python3
# coding: utf-8
"""
功能:
- 从 data.txt 中按顺序精确提取:身份证(idcard)、手机号(phone)、银行卡(bankcard)、IPv4(ip)、MAC(mac)。
- 严格遵循《个人信息数据规范文档》,优化正则表达式和匹配策略以达到高准确率。
- 所有匹配项均保留原始格式,并输出到 output.csv 文件中。
"""
import re
import csv
from datetime import datetime
# ------------------- Configuration -------------------
# File that is scanned for sensitive values.
INPUT_FILE = "data.txt"
# CSV file that receives the (category, value) rows.
OUTPUT_FILE = "output.csv"
# Set to True to print a detailed accept/reject log to the console.
DEBUG = False

# Whitelist of three-digit mobile-number prefixes, grouped by their
# leading two digits for easier review.
ALLOWED_MOBILE_PREFIXES = {
    # 13x
    "130", "131", "132", "133", "134", "135", "136", "137", "138", "139",
    # 14x
    "140", "145", "146", "147", "148", "149",
    # 15x
    "150", "151", "152", "153", "155", "156", "157", "158", "159",
    # 16x
    "166", "167",
    # 17x
    "171", "172", "173", "174", "175", "176", "177", "178",
    # 18x
    "180", "181", "182", "183", "184", "185", "186", "187", "188", "189",
    # 19x
    "190", "191", "193", "195", "196", "198", "199",
}
# ------------------------------------------------------
# ------------------- 校验函数 -------------------
def luhn_check(digits: str) -> bool:
    """Return True when *digits* passes the Luhn checksum.

    Walking from the rightmost character, every second digit is doubled
    (9 is subtracted when the doubled value exceeds 9) and all values are
    summed; the string passes when the sum is a multiple of 10.

    Empty input, or input containing anything other than the ASCII digits
    0-9, returns False instead of vacuously passing (sum 0) or raising
    ValueError from ``int()``.
    """
    if not digits or any(ch not in "0123456789" for ch in digits):
        return False

    total = 0
    for position, ch in enumerate(reversed(digits)):
        value = int(ch)
        # Odd positions (counted from the right, starting at 0) are doubled.
        if position % 2:
            value *= 2
            if value > 9:
                value -= 9
        total += value
    return total % 10 == 0
def is_valid_id(raw: str):
    """Validate an 18-character ID-card number.

    Separators (whitespace, hyphen, no-break space, ideographic space,
    en dash, em dash) are removed before checking. The remaining text must
    be 17 ASCII digits followed by an ASCII digit or ``X``/``x``; characters
    7-14 must form a real calendar date (YYYYMMDD) whose year is 1900 or
    later and which is not in the future; and the last character must equal
    the weighted mod-11 check character computed from the first 17 digits.

    Args:
        raw: Candidate text, possibly containing separators.

    Returns:
        A ``(ok, reason)`` tuple: ``(True, "")`` when valid, otherwise
        ``(False, message)`` where *message* describes the first failed check.
    """
    sep_pattern = r'[\s\-\u00A0\u3000\u2013\u2014]'
    s = re.sub(sep_pattern, '', raw)

    # Explicit [0-9] keeps non-ASCII digits (e.g. full-width) out; the check
    # character was already ASCII-only, so this makes the whole body agree.
    if re.fullmatch(r'[0-9]{17}[0-9Xx]', s) is None:
        return False, "无效的格式或长度"

    try:
        birth_date = datetime.strptime(s[6:14], "%Y%m%d")
    except ValueError:
        return False, "无效的出生日期"

    now = datetime.now()
    if not (1900 <= birth_date.year <= now.year):
        return False, f"无效的出生年份: {birth_date.year}"
    # A year-only check would accept later dates within the current year.
    if birth_date > now:
        return False, "出生日期晚于当前日期"

    weights = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
    check_map = ['1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2']
    total = sum(int(digit) * weight for digit, weight in zip(s[:17], weights))
    expected_check = check_map[total % 11]
    if s[17].upper() != expected_check:
        return False, f"校验码不匹配: 期望值 {expected_check}"
    return True, ""
def is_valid_phone(raw: str) -> bool:
    """Return True when *raw* is an 11-digit mobile number with an allowed prefix.

    Every character other than the ASCII digits 0-9 is discarded first, so
    formatting such as ``+``, parentheses, spaces and hyphens is ignored.
    If more than 11 digits remain and they start with ``86``, that country
    code is dropped. The result must be exactly 11 digits long and its first
    three digits must be in ``ALLOWED_MOBILE_PREFIXES``.

    Only ASCII digits are counted. Previously ``\\D`` kept non-ASCII digits,
    so a number mixing ASCII and full-width digits was accepted while an
    all-full-width one was rejected by the prefix check.
    """
    digits = re.sub(r'[^0-9]', '', raw)
    if len(digits) > 11 and digits.startswith("86"):
        digits = digits[2:]
    return len(digits) == 11 and digits[:3] in ALLOWED_MOBILE_PREFIXES
def is_valid_bankcard(raw: str) -> bool:
    """Return True when *raw* is 16-19 ASCII digits that pass the Luhn check.

    The input must consist solely of the characters 0-9; separators are not
    accepted here. ``str.isdigit()`` is deliberately not used because it is
    also true for characters such as superscript digits, which ``int()``
    cannot convert and which would make the Luhn step raise ValueError.
    """
    if not 16 <= len(raw) <= 19:
        return False
    if any(ch not in "0123456789" for ch in raw):
        return False
    return luhn_check(raw)
def is_valid_ip(raw: str) -> bool:
    """Return True when *raw* is a dotted-quad IPv4 address.

    The string must split on ``.`` into exactly four parts. Each part must
    be a non-empty run of ASCII digits, must not have a leading zero unless
    it is the single digit ``0``, and must be in the range 0-255.

    ``str.isdigit()`` is deliberately not used: it is true for characters
    such as superscript digits that ``int()`` rejects, which previously made
    this function raise ValueError instead of returning False.
    """
    parts = raw.split('.')
    if len(parts) != 4:
        return False

    for part in parts:
        if not part or any(ch not in "0123456789" for ch in part):
            return False
        # Reject zero-padded octets such as '01'.
        if len(part) > 1 and part[0] == '0':
            return False
        if int(part) > 255:
            return False
    return True
def is_valid_mac(raw: str) -> bool:
    """Return True when *raw* is exactly six colon-separated hex pairs.

    The whole string must match (``re.fullmatch``), so leading or trailing
    characters, other separators, or a different number of groups all give
    False. Both upper- and lower-case hex digits are accepted.
    """
    # Final confirmation only; this applies the same six-pair shape again
    # to the complete string.
    return re.fullmatch(r'([0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}', raw, re.IGNORECASE) is not None
# ------------------- Regular expression definitions -------------------
# The alternatives are tried left to right at each position, so the most
# distinctive formats are listed first to reduce ambiguous matches.

# 1. MAC address: six hex pairs separated by colons.
mac_pattern = r'(?P<mac>(?:[0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2})'

# 2. IPv4 address: four dot-separated octets, each alternative limited to
#    0-255; the digit lookarounds stop a match from starting or ending
#    inside a longer run of digits.
ip_pattern = r'(?P<ip>(?<!\d)(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?!\d))'

# 3. ID-card number: 6 + 8 + 4 characters with optional separators between
#    the groups; the last character may be a digit or X/x.
sep = r'[\s\-\u00A0\u3000\u2013\u2014]'
id_pattern = rf'(?P<id>(?<!\d)\d{{6}}(?:{sep}*)\d{{8}}(?:{sep}*)\d{{3}}[0-9Xx](?!\d))'

# 4. Bank-card number: 16-19 contiguous digits not adjacent to other digits.
bankcard_pattern = r'(?P<bankcard>(?<!\d)\d{16,19}(?!\d))'

# 5. Mobile number: optional "(+86)" or "+86" (plus optional whitespace),
#    then either 11 contiguous digits or a 3-4-4 grouping joined by a space
#    or hyphen. Listed last among the alternatives.
phone_prefix = r'(?:\(\+86\)|\+86\s*)'
phone_body = r'(?:\d{11}|\d{3}[ -]\d{4}[ -]\d{4})'
phone_pattern = rf'(?P<phone>(?<!\d)(?:{phone_prefix})?{phone_body}(?!\d))'

# Join every alternative into one compiled expression; the named group that
# matched identifies which kind of candidate was found.
combined_re = re.compile(
    '|'.join((mac_pattern, ip_pattern, id_pattern, bankcard_pattern, phone_pattern)),
    flags=re.IGNORECASE | re.UNICODE,
)
# ------------------- 主逻辑 -------------------
def extract_from_text(text: str):
    """Find every candidate in *text* with ``combined_re`` and validate it.

    Candidates are visited in the order they occur in the text. The named
    group that matched selects the validator; accepted values are kept
    exactly as they appear in the text.

    An ``id`` candidate that fails ID-card validation is retried as a bank
    card, because a contiguous 18-digit card number is claimed by the ID
    alternative first. The retry is skipped when the candidate ends in
    ``X``/``x``: stripping that letter would leave 17 digits that could pass
    the bank-card check even though the original text is not a card number.

    Args:
        text: The text to scan.

    Returns:
        A list of ``(category, value)`` tuples where *category* is one of
        ``'mac'``, ``'ip'``, ``'idcard'``, ``'bankcard'`` or ``'phone'``.
    """
    # Kinds whose handling is just "validate, then keep under the same name".
    simple_validators = {
        'mac': is_valid_mac,
        'ip': is_valid_ip,
        'bankcard': is_valid_bankcard,
        'phone': is_valid_phone,
    }

    results = []
    for match in combined_re.finditer(text):
        kind = match.lastgroup
        value = match.group(kind)

        if kind == 'id':
            is_valid, reason = is_valid_id(value)
            if is_valid:
                if DEBUG:
                    print(f"【接受 idcard】: {value}")
                results.append(('idcard', value))
                continue

            # Fallback: a failed ID candidate may still be a bank card.
            digits_only = re.sub(r'\D', '', value)
            ends_with_digit = value[-1] not in 'Xx'
            if ends_with_digit and is_valid_bankcard(digits_only):
                if DEBUG:
                    print(f"【接受 id->bankcard】: {value}")
                # The original formatting of the match is preserved.
                results.append(('bankcard', value))
            elif DEBUG:
                print(f"【拒绝 id】: {value} (原因: {reason})")
            continue

        validator = simple_validators.get(kind)
        if validator is None:
            continue
        if validator(value):
            if DEBUG:
                print(f"【接受 {kind}】: {value}")
            results.append((kind, value))
        elif DEBUG:
            print(f"【拒绝 {kind}】: {value}")
    return results
def main():
    """Read INPUT_FILE, extract validated values, and write OUTPUT_FILE.

    The input is decoded as UTF-8. Undecodable bytes are replaced with
    U+FFFD rather than dropped, so the digits on either side of a bad byte
    stay separated instead of being fused into one longer candidate.

    If INPUT_FILE does not exist, a message is printed, an empty file is
    created in its place, and extraction runs on empty text. OUTPUT_FILE is
    always (re)written with a ``category,value`` header followed by one row
    per accepted value, and a summary line is printed at the end.
    """
    try:
        with open(INPUT_FILE, "r", encoding="utf-8", errors="replace") as f:
            text = f.read()
    except FileNotFoundError:
        print(f"错误: 输入文件 '{INPUT_FILE}' 未找到。请确保该文件存在于脚本运行目录下。")
        # Create an empty input file so the rest of the run can proceed.
        with open(INPUT_FILE, "w", encoding="utf-8") as f:
            f.write("")
        print(f"已自动创建空的 '{INPUT_FILE}'。请向其中填充需要分析的数据。")
        text = ""

    extracted_data = extract_from_text(text)

    # newline="" lets the csv module control line endings itself.
    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["category", "value"])
        writer.writerows(extracted_data)

    print(f"分析完成。共识别 {len(extracted_data)} 条有效敏感数据。结果已保存至 '{OUTPUT_FILE}'。")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|