关于python正则表达式场景收集(三)
本期收录的正则表达式场景包括HTML解析和CSV解析,主要是提取正则表达式中括号()(即捕获组)所匹配的内容。例子比较简单;对于css、class等更复杂的解析,建议还是使用专业的HTML解析包来解决。
如下:
import re
# Regex patterns for pulling pieces out of simple HTML, keyed by a short name.
# They are meant to be passed to re.findall(): when a pattern contains a
# capture group, findall returns the group's text instead of the whole match.

# Attribute-less tags all share one shape: <tag>(lazy inner text)</tag>.
html_regex_dict = {
    tag: r'<{0}>(.*?)</{0}>'.format(tag)
    for tag in ('tr', 'th', 'td', 'title')
}
html_regex_dict.update({
    # Tags written with attributes: skip lazily to the first '>' and then
    # capture the inner text up to the closing tag.
    'span': r'<span .*?>(.*?)</span>',
    'a': r'<a .*?>(.*?)</a>',
    # No capture group here, so the whole matched text is returned.
    # NOTE: the '.*?' in front of 'href=' is not confined to a single tag;
    # with re.S a match can start at an <a> that has no href and run on
    # until the next anchor that does.
    'ahref': r'<a.*?href=.*?<\/a>',
    # Captures only the URL inside href="..." / href='...'.
    'hrefurl': '<a[^>]+href=["\'](.*?)["\']',
})
# How the 'hrefurl' pattern  <a[^>]+href=["'](.*?)["']  reads, left to right:
#   <a       a literal '<' followed by the letter 'a'
#   [^>]+    one or more characters that are not '>'
#   href=    the literal text 'href='
#   ["']     one quote character, double or single (the backslash seen in
#            the Python source only escapes the quote inside the string
#            literal; it is not part of the regex)
#   (.*?)    as few characters as possible -- this is the captured URL
#   ["']     the closing quote, double or single
# The result returned by findall is the text matched inside the parentheses.
# Sample HTML document that the regex examples in this file search.
# It contains a <title>, several <a> anchors (with href, with name only, and
# one wrapping an <img>), a table with one header row and two data rows, and
# a form with assorted inputs.
html='''
<html>
<head>
<title>Document name goes here</title>
</head>
<body>
Visible text goes here
<a href="http://www.example.com/">This is a Link</a>
<a href="http://www.example.com/"><img src="URL" alt="Alternate Text"></a>
<a href="mailto:webmaster@example.com">Send e-mail</a>A named anchor:<a name="tips">Useful Tips Section</a>
<a href="#tips">Jump to the Useful Tips Section</a>
<table border="1">
<tr>
<th>标题1</th>
<th>标题2</th>
</tr>
<tr>
<td>数据11</td>
<td>数据12</td>
</tr>
<tr>
<td>数据21</td>
<td>数据22</td>
</tr>
</table>
<form action="http://www.example.com/test.asp" method="post/get">
<input type="text" name="lastname" value="Nixon" size="30" maxlength="50">
<input type="password">
<input type="checkbox" checked="checked">
<input type="radio" checked="checked">
<input type="submit">
<input type="reset">
<input type="hidden">
<select>
<option>Apples
<option selected>Bananas
<option>Cherries
</select>
<textarea name="Comment" rows="60" cols="20"></textarea>
</form>
<a href="#tips">Jump to the last Tips Section</a>
</body>
</html>
'''
# re.findall(pattern, string[, flags]):
# scans `string` and returns every match as a list.  Flags used below:
#   re.I (re.IGNORECASE): ignore case (full name in parentheses)
#   re.M (re.MULTILINE):  multi-line mode, changes how '^' and '$' behave
#   re.S (re.DOTALL):     dot-matches-all mode, changes how '.' behaves

# Page title: findall returns a list, so take its first element.
tagtitle = re.findall(html_regex_dict['title'], html, re.S|re.M)
print(tagtitle[0])

# Table rows: each element of tagtrs is the inner text of one <tr>...</tr>.
tagtrs = re.findall(html_regex_dict['tr'], html, re.S|re.M)
for tr in tagtrs:
    print('tr=', tr)
    # Header cells (<th>) found inside this row, if any.
    tagths = re.findall(html_regex_dict['th'], tr, re.S|re.M)
    for th in tagths:
        print('th=', th)
    # Data cells (<td>) found inside this row, if any.
    tagtds = re.findall(html_regex_dict['td'], tr, re.S|re.M)
    for td in tagtds:
        print('td=', td)

# Inner content of every <a ...>...</a> element.
tagas = re.findall(html_regex_dict['a'], html, re.S|re.M)
for taga in tagas:
    print(taga)
# This is a Link
# <img src="URL" alt="Alternate Text">
# Send e-mail
# Useful Tips Section
# Jump to the Useful Tips Section
# Jump to the last Tips Section

# Whole anchor elements.  The pattern has no capture group, so the full
# matched text is printed.  Because of re.S, the name-only anchor and the
# anchor on the following line come back as a single match containing a
# newline, which is why they show up as two printed lines below.
tagahref = re.findall(html_regex_dict['ahref'], html, re.I|re.S|re.M)
for ahref in tagahref:
    print(ahref)
# <a href="http://www.example.com/">This is a Link</a>
# <a href="http://www.example.com/"><img src="URL" alt="Alternate Text"></a>
# <a href="mailto:webmaster@example.com">Send e-mail</a>
# <a name="tips">Useful Tips Section</a>
# <a href="#tips">Jump to the Useful Tips Section</a>
# <a href="#tips">Jump to the last Tips Section</a>

# Only the URL held in each anchor's href attribute.
taghrefurls = re.findall(html_regex_dict['hrefurl'], html, re.I|re.S|re.M)
for url in taghrefurls:
    print(url)
# http://www.example.com/
# http://www.example.com/
# mailto:webmaster@example.com
# #tips
# #tips
# CSV example: every line holds four double-quoted fields separated by commas.
strs='''"TM_YX_YKQLC_YKBZ_DW","业扩报装统计表","8","每日"
"TM_YX_YKQLC_WCHJ_DW","完成环节统计表","9","每日"
"TM_YX_YKQLC_ZTYW_NEW","在途业务统计表","10","每日"
'''
# One capture group per field; the third field must consist of digits.
# '^' and '$' anchor the pattern to a single line, which relies on re.M below.
reg=r'^"(?P<tablename>.*)","(?P<tabledesc>.*)","(?P<tableseq>\d+)","(?P<dataseq>.*)"$'
# With several groups in the pattern, findall yields one tuple per matching
# line, holding the groups in order.
rows=re.findall(reg, strs, re.I|re.M)
for row in rows:
    print(row[0],row[1],row[2],row[3])
# TM_YX_YKQLC_YKBZ_DW 业扩报装统计表 8 每日
# TM_YX_YKQLC_WCHJ_DW 完成环节统计表 9 每日
# TM_YX_YKQLC_ZTYW_NEW 在途业务统计表 10 每日
相关文章