Python re正则表达式

更新日期: 2025-10-26 分享

re模块是Python中处理正则表达式的标准库。正则表达式是一种强大的文本匹配工具，可以用来查找、替换和提取字符串中的特定内容。

为什么需要正则表达式

在处理文本时，我们经常需要：

验证用户输入的格式（如邮箱、电话号码）
从大量文本中提取特定信息
批量替换文本内容
清理和格式化数据

手动编写代码处理这些任务很麻烦，正则表达式可以大大简化这些工作。

基本使用方法

导入模块

import re

简单匹配示例

import re

# 查找数字
text = "我的电话是123-4567"
pattern = r'\d+'  # 匹配一个或多个数字

result = re.search(pattern, text)
if result:
    print("找到数字:", result.group())  # 输出: 找到数字: 123
else:
    print("没有找到数字")

常用匹配函数

search() - 查找第一个匹配

import re

text = "苹果价格是5元，香蕉价格是3元"
pattern = r'\d+元'

# 查找第一个匹配
match = re.search(pattern, text)
if match:
    print("找到:", match.group())  # 输出: 找到: 5元

findall() - 查找所有匹配

import re

text = "苹果5元，香蕉3元，橙子4元"
pattern = r'\d+元'

# 查找所有匹配
matches = re.findall(pattern, text)
print("所有价格:", matches)  # 输出: 所有价格: ['5元', '3元', '4元']

match() - 从开头匹配

import re

text1 = "123abc"
text2 = "abc123"
pattern = r'\d+'

# 只在字符串开头匹配
result1 = re.match(pattern, text1)
result2 = re.match(pattern, text2)

print("text1匹配:", result1.group() if result1 else "不匹配")  # 输出: 123
print("text2匹配:", result2.group() if result2 else "不匹配")  # 输出: 不匹配

sub() - 替换文本

import re

text = "今天是2023-05-15，会议在2023-05-20"
pattern = r'\d{4}-\d{2}-\d{2}'

# 替换日期格式
new_text = re.sub(pattern, "某年某月某日", text)
print(new_text)  # 输出: 今天是某年某月某日，会议在某年某月某日

正则表达式语法

基本元字符

import re

text = "abc 123 XYZ !@#"

# 匹配数字
numbers = re.findall(r'\d', text)
print("数字:", numbers)  # 输出: ['1', '2', '3']

# 匹配字母
letters = re.findall(r'[a-zA-Z]', text)
print("字母:", letters)  # 输出: ['a', 'b', 'c', 'X', 'Y', 'Z']

# 匹配非字母数字字符
symbols = re.findall(r'\W', text)
print("符号:", symbols)  # 输出: [' ', ' ', ' ', '!', '@', '#']

量词的使用

import re

text = "a ab abb abbb abbbb"

# Match "a" followed by different numbers of "b"
print("b出现0次或多次:", re.findall(r'ab*', text))    # ['a', 'ab', 'abb', 'abbb', 'abbbb']
print("b出现1次或多次:", re.findall(r'ab+', text))    # ['ab', 'abb', 'abbb', 'abbbb']
print("b出现0次或1次:", re.findall(r'ab?', text))     # ['a', 'ab', 'ab', 'ab', 'ab']
print("b出现2-3次:", re.findall(r'ab{2,3}', text))   # ['abb', 'abbb', 'abbb'] ("abbbb" contributes its first three b's)

字符集合

import re

text = "cat bat rat mat hat"

# 匹配特定开头的单词
matches = re.findall(r'[bc]at', text)
print("匹配结果:", matches)  # 输出: ['cat', 'bat']

# 匹配非r和m开头的at单词
matches2 = re.findall(r'[^rm]at', text)
print("排除匹配:", matches2)  # 输出: ['cat', 'bat', 'hat']

分组和捕获

基本分组

import re

text = "2023-05-15 2024-06-20"

# 提取年月日
pattern = r'(\d{4})-(\d{2})-(\d{2})'
matches = re.findall(pattern, text)
print("分组匹配:", matches)  # 输出: [('2023', '05', '15'), ('2024', '06', '20')]

# 使用search获取分组
match = re.search(pattern, text)
if match:
    print("完整匹配:", match.group(0))    # 2023-05-15
    print("年份:", match.group(1))        # 2023
    print("月份:", match.group(2))        # 05
    print("日期:", match.group(3))        # 15

命名分组

import re

text = "姓名:张三 年龄:25 城市:北京"

# 使用命名分组
pattern = r'姓名:(?P<name>\w+)\s+年龄:(?P<age>\d+)\s+城市:(?P<city>\w+)'
match = re.search(pattern, text)

if match:
    print("姓名:", match.group('name'))  # 张三
    print("年龄:", match.group('age'))   # 25
    print("城市:", match.group('city'))  # 北京
    print("分组字典:", match.groupdict()) # {'name': '张三', 'age': '25', 'city': '北京'}

实际应用示例

验证邮箱格式

import re

def validate_email(email):
    """Return True if *email* is a syntactically plausible e-mail address.

    The whole string must be: a local part, ``@``, a domain, a dot and a
    top-level domain of at least two ASCII letters.

    Args:
        email: The candidate address (a string).

    Returns:
        bool: True when the entire string matches the pattern, else False.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # fullmatch() requires the pattern to span the whole string. With
    # match() and a "$" anchor, "user@example.com\n" would be accepted,
    # because "$" also matches just before a trailing newline.
    return re.fullmatch(pattern, email) is not None

# 测试邮箱验证
emails = [
    "user@example.com",
    "hello.world@test.org",
    "invalid-email",
    "missing@dot",
    "test@sub.domain.com"
]

for email in emails:
    valid = validate_email(email)
    print(f"{email:25} -> {'有效' if valid else '无效'}")

提取手机号码

import re

def extract_phone_numbers(text):
    """Extract mainland-China style mobile numbers from *text*.

    A number is exactly 11 digits: a leading ``1``, a second digit in 3-9,
    then nine more digits.

    Args:
        text: The text to scan.

    Returns:
        list[str]: Every matching number, in order of appearance.
    """
    # The lookarounds reject candidates that are embedded in a longer run
    # of digits (e.g. an order number), which would otherwise yield a
    # bogus 11-digit slice.
    pattern = r'(?<!\d)1[3-9]\d{9}(?!\d)'
    return re.findall(pattern, text)

# 测试提取手机号
text = """
联系方式：
张三：13812345678
李四：13987654321
王五：15011112222
无效号码：1234567890
"""

phone_numbers = extract_phone_numbers(text)
print("找到的手机号码:")
for phone in phone_numbers:
    print(phone)

清理html标签

import re

def remove_html_tags(html_text):
    """Return *html_text* with every ``<...>`` tag removed.

    A tag is a ``<``, one or more characters other than ``>``, and a
    closing ``>``. Text between tags is left untouched.
    """
    tag_re = re.compile(r'<[^>]+>')
    return tag_re.sub('', html_text)

# 测试HTML清理
html_content = """
<div>
    <h1>标题</h1>
    <p>这是一个<strong>段落</strong>内容。</p>
    <a href="http://example.com">链接</a>
</div>
"""

clean_content = remove_html_tags(html_content)
print("清理后的文本:")
print(clean_content)

高级技巧

编译正则表达式

import re

# Compile the patterns once so the pattern objects can be reused directly.
phone_pattern = re.compile(r'1[3-9]\d{9}')
# No ^/$ anchors: this pattern is used with findall() to pull addresses out
# of a longer sentence. Anchored, it could only match when the whole text is
# an address, and the demo below would print an empty list.
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

text = "电话：13800138000，邮箱：test@example.com"

# Search with the compiled pattern objects.
phones = phone_pattern.findall(text)
emails = email_pattern.findall(text)

print("手机号:", phones)
print("邮箱:", emails)

使用标志参数

import re

text = """第一行：Hello
第二行：WORLD
第三行：python"""

# 忽略大小写
matches1 = re.findall(r'hello', text, re.IGNORECASE)
print("忽略大小写:", matches1)  # ['Hello']

# 多行模式
matches2 = re.findall(r'^第.*', text, re.MULTILINE)
print("多行匹配:", matches2)  # ['第一行：Hello', '第二行：WORLD', '第三行：python']

# 详细模式，可以添加注释
pattern = re.compile(r'''
    \b       # 单词边界
    \w+      # 一个或多个单词字符
    \b       # 单词边界
''', re.VERBOSE)

matches3 = pattern.findall(text)
print("单词匹配:", matches3)

非贪婪匹配

import re

text = "<div>内容1</div><div>内容2</div>"

# 贪婪匹配（默认）
greedy_match = re.findall(r'<div>.*</div>', text)
print("贪婪匹配:", greedy_match)  # ['<div>内容1</div><div>内容2</div>']

# 非贪婪匹配
non_greedy_match = re.findall(r'<div>.*?</div>', text)
print("非贪婪匹配:", non_greedy_match)  # ['<div>内容1</div>', '<div>内容2</div>']

实用工具函数

文本信息提取器

import re

class TextExtractor:
    """Extract e-mail addresses, phone numbers, URLs, IPs and money amounts."""

    def __init__(self):
        # Maps an information type to the regex source used to find it.
        self.patterns = {
            # "[A-Za-z]": a "|" inside a character class is a literal pipe,
            # not alternation, so it must not appear in the TLD class.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            # Lookarounds keep the match from being a slice of a longer
            # digit run.
            'phone': r'(?<!\d)1[3-9]\d{9}(?!\d)',
            'url': r'https?://[^\s]+',
            # Each octet is limited to 0-255.
            'ip': (r'\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}'
                   r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b'),
            # Require a currency symbol before or "元" after the amount;
            # otherwise every bare number (phone, IP octet, ...) would be
            # reported as money.
            'money': r'[￥¥$]\s*\d+(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?\s*元',
        }

    def extract_all(self, text):
        """Run every pattern over *text*.

        Args:
            text: The text to scan.

        Returns:
            dict[str, list[str]]: Information type -> list of matches.
            Types with no match are omitted.
        """
        results = {}

        for info_type, pattern in self.patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                results[info_type] = matches

        return results

# 使用提取器
extractor = TextExtractor()

sample_text = """
联系信息：
邮箱：user@example.com, admin@test.org
电话：13800138000, 13912345678
网站：https://www.example.com
IP地址：192.168.1.1
金额：$100.50, ￥200元
"""

results = extractor.extract_all(sample_text)
for info_type, items in results.items():
    print(f"{info_type}: {items}")

数据清洗工具

import re

class DataCleaner:
    """Stateless regex helpers for tidying raw text."""

    @staticmethod
    def remove_extra_spaces(text):
        """Collapse each whitespace run into one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def remove_special_chars(text, keep_chars=''):
        """Remove every character that is not explicitly allowed.

        Allowed: ASCII letters, ASCII digits 0-9, characters in the range
        U+4E00..U+9FA5, and any character listed in *keep_chars*.

        Args:
            text: The text to clean.
            keep_chars: Extra characters to preserve.

        Returns:
            str: *text* with all other characters removed.
        """
        # re.escape() keeps characters such as "-", "]" or "^" literal when
        # they are spliced into the character class.
        pattern = f'[^a-zA-Z0-9\u4e00-\u9fa5{re.escape(keep_chars)}]'
        return re.sub(pattern, '', text)

    @staticmethod
    def extract_numbers(text):
        """Return all unsigned integers and decimals in *text* as strings.

        A decimal point is part of the number only when digits follow it,
        so a sentence-ending dot ("total 3.") is not captured.
        """
        return re.findall(r'\d+(?:\.\d+)?', text)

    @staticmethod
    def format_phone(phone):
        """Format an 11-digit phone number as ``XXX XXXX XXXX``.

        Non-digit characters are ignored when counting. If the input does
        not contain exactly 11 digits it is returned unchanged.
        """
        cleaned = re.sub(r'\D', '', phone)
        if len(cleaned) == 11:
            return f"{cleaned[:3]} {cleaned[3:7]} {cleaned[7:]}"
        return phone

# 测试数据清洗
cleaner = DataCleaner()

dirty_text = "  这  是   测试  文本  ！！！电话：138-0013-8000  "
print("原始文本:", repr(dirty_text))

cleaned_text = cleaner.remove_extra_spaces(dirty_text)
print("清理空格:", repr(cleaned_text))

no_special = cleaner.remove_special_chars(cleaned_text, '：')
print("移除特殊字符:", repr(no_special))

formatted_phone = cleaner.format_phone("138-0013-8000")
print("格式化电话:", formatted_phone)

常见问题解决

处理复杂匹配

import re

def parse_log_line(log_line):
    """Split a ``<timestamp> <level> <message>`` log line into its parts.

    The timestamp must look like ``YYYY-MM-DD HH:MM:SS`` and sit at the
    start of the line.

    Returns:
        dict with keys ``timestamp``, ``level`` and ``message``, or None
        when the line does not start with that layout.
    """
    parsed = re.match(
        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) (.*)', log_line
    )
    if parsed is None:
        return None

    timestamp, level, message = parsed.groups()
    return {'timestamp': timestamp, 'level': level, 'message': message}

# 测试日志解析
log_lines = [
    "2024-01-20 10:30:15 INFO 用户登录成功",
    "2024-01-20 10:31:20 ERROR 数据库连接失败",
    "无效的日志行"
]

for line in log_lines:
    parsed = parse_log_line(line)
    if parsed:
        print(f"时间: {parsed['timestamp']}, 级别: {parsed['level']}, 消息: {parsed['message']}")
    else:
        print(f"无法解析: {line}")

性能优化建议

import re
import time

# Less efficient form: the pattern source is handed to the re module on every
# call, so each call pays for a pattern lookup (re caches compiled patterns
# internally, so this is a lookup rather than a full recompilation).
def slow_search(text, pattern):
    """Return the first match of *pattern* in *text*, or None."""
    return re.compile(pattern).search(text)

# 好的做法：预编译
class FastSearcher:
    """Search text with a fixed set of named, pre-compiled patterns."""

    def __init__(self, patterns):
        """Compile every pattern in the *patterns* mapping (name -> regex)."""
        self.compiled_patterns = {
            label: re.compile(source) for label, source in patterns.items()
        }

    def search(self, text, pattern_name):
        """Return the first match of the named pattern in *text*, or None.

        Raises:
            KeyError: If *pattern_name* was not registered.
        """
        compiled = self.compiled_patterns[pattern_name]
        return compiled.search(text)

# Demo: build a searcher from named patterns and run one lookup.
patterns = {
    # "[A-Za-z]": a "|" inside a character class is a literal pipe, not
    # alternation, so it must not appear in the TLD class.
    'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    'phone': r'1[3-9]\d{9}'
}

searcher = FastSearcher(patterns)
text = "测试文本，邮箱：test@example.com，电话：13800138000"

# Search with the pre-compiled pattern.
result = searcher.search(text, 'email')
if result:
    print("找到邮箱:", result.group())

快捷查询

1. 核心函数

方法	说明	示例
re.compile(pattern)	预编译正则表达式（提升复用性能）	pat = re.compile(r'\d+')
re.search(pattern, string)	搜索字符串中第一个匹配项	re.search(r'\d+', 'a1b2') → 匹配 '1'
re.match(pattern, string)	从字符串起始位置匹配	re.match(r'\d+', '123a') → 匹配 '123'
re.fullmatch(pattern, string)	整个字符串完全匹配	re.fullmatch(r'\d+', '123') → 匹配 '123'
re.findall(pattern, string)	返回所有非重叠匹配的列表	re.findall(r'\d+', 'a1b22c') → ['1', '22']
re.finditer(pattern, string)	返回所有匹配的迭代器（含位置信息）	for m in re.finditer(r'\d+', 'a1b2'): print(m.group())
re.sub(pattern, repl, string)	替换匹配项	re.sub(r'\d+', 'X', 'a1b2') → 'aXbX'
re.split(pattern, string)	按匹配项分割字符串	re.split(r'\d+', 'a1b2c') → ['a', 'b', 'c']

2. 匹配对象（Match）方法/属性

方法/属性	说明	示例
group()	返回整个匹配的字符串	m.group() → 'abc'
group(n)	返回第n个捕获组的内容	m = re.search(r'(\d)(\d)', '12'); m.group(1) → '1'
groups()	返回所有捕获组的元组	m.groups() → ('1', '2')
start()/end()	匹配的起始/结束位置	m.start() → 0
span()	返回匹配范围 (start, end)	m.span() → (0, 2)

3. 正则表达式元字符（部分）

元字符	说明	示例匹配
.	匹配任意字符（除换行符）	a.c → 'abc'
\d	匹配数字	\d+ → '123'
\D	匹配非数字	\D+ → 'abc'
\w	匹配单词字符（字母、数字、下划线）	\w+ → 'Ab_1'
\W	匹配非单词字符	\W+ → '!@#'
\s	匹配空白字符（空格、制表符等）	\s+ → ' \t'
\S	匹配非空白字符	\S+ → 'abc'
[]	字符集合	[A-Za-z] → 任意字母
^	匹配字符串开头	^\d+ → 开头的数字
$	匹配字符串结尾	\d+$ → 结尾的数字
*	匹配前一个元素（字符、字符集或分组）0次或多次	a* → '', 'aaa'
+	匹配前一个元素（字符、字符集或分组）1次或多次	a+ → 'a', 'aaa'
?	匹配前一个元素（字符、字符集或分组）0次或1次	a? → '', 'a'
{m,n}	匹配前一个元素（字符、字符集或分组）m到n次	a{2,3} → 'aa', 'aaa'
|	或操作	cat|dog → 'cat' 或 'dog'
()	捕获分组	(\d+) → 提取数字

4. 编译标志（flags 参数）

标志	说明	示例
re.IGNORECASE (re.I)	忽略大小写	re.search(r'abc', 'ABC', re.I)
re.MULTILINE (re.M)	多行模式（影响 ^ 和 $）	re.findall(r'^\d+', '1\n2', re.M) → ['1', '2']
re.DOTALL (re.S)	让 . 匹配包括换行符的所有字符	re.search(r'a.*b', 'a\nb', re.S)
re.ASCII	让 \w, \W 等仅匹配ASCII字符	re.search(r'\w+', 'こん', re.ASCII) → 无匹配
re.VERBOSE (re.X)	允许正则中添加注释和空格	re.compile(r'''\d+ # 匹配数字''', re.X)

总结

re模块提供了强大的文本处理能力：

模式匹配：search、match、findall等函数
文本替换：sub函数实现批量替换
分组提取：可以提取匹配的特定部分
复杂模式：支持各种复杂的匹配规则

使用技巧：

频繁使用的正则表达式应该预编译
使用原始字符串（r''）避免转义问题
合理使用贪婪和非贪婪匹配
复杂的正则表达式可以使用re.VERBOSE添加注释

本文内容仅供个人学习/研究/参考使用，不构成任何决策建议或专业指导。分享/转载时请标明原文来源，同时请勿将内容用于商业售卖、虚假宣传等非学习用途哦～感谢您的理解与支持！

链接: https://fly63.com/course/36_2144

<< Python datetime日期时间处理 Python csv表格数据处理 >>