python爬虫实例 (urllib & BeautifulSoup)
python 2.7.6
urllib:发送报文并得到response
BeautifulSoup:解析报文的body(html)
#encoding=UTF-8
from bs4 import BeautifulSoup
from urllib import urlopen
import urllib
list_no_results=[]# card numbers for which neither site returned any info
list_yes_results=[]# card numbers that were looked up successfully
# Parse the response body (HTML) into a dict.
def parseData(htmls, code):
    """Extract card info from a lookup site's HTML body.

    htmls -- raw HTML string returned by one of the lookup sites
    code  -- which site produced the page: '00' (6wm.cn) or '01' (cha789.com)

    Returns {'id': card number, 'lt': issuing location, 'type': card type},
    or {} when `code` is not one of the two known site codes.
    """
    dic = {}
    soup = BeautifulSoup(htmls)
    if code == '00':
        # Site 00 puts the three fields in <td class="STYLE2"> cells
        # (align=False skips the aligned header cells).
        cells = soup.find_all('td', 'STYLE2', align=False)
        dic = {'id': cells[0].string, 'lt': cells[1].string, 'type': cells[2].string}
    elif code == '01':
        # Site 01 renders "label|value" pairs in <li> items 3..5;
        # get_text('|') joins the fragments so the value is split index 1.
        items = soup.find_all('li')
        dic['id'] = items[3].get_text('|').split('|')[1]
        dic['lt'] = items[4].get_text('|').split('|')[1]
        dic['type'] = items[5].get_text('|').split('|')[1]
    return dic
# Query site 00/01 for card info, pass the response body to parseData,
# and return the resulting dict.
def getInfoFromWEB00(cardNumStr):
    """Query site 00 (6wm.cn) for info on card number `cardNumStr`.

    First tries the static per-card page.  If its body contains
    '404 Not Found', POSTs the card number to bank.PHP (which appears to
    make the site generate the page) and retries the static URL once.

    Returns the dict from parseData on success; on failure appends the
    card number to the module-level list_no_results and returns False.
    """
    # e.g. http://www.6wm.cn/card/6222020200094043425.html
    url_get = 'http://www.6wm.cn/card/' + cardNumStr + '.html'
    resp = urlopen(url_get)
    try:
        body = resp.read()
    finally:
        resp.close()  # don't leak the socket
    if '404 Not Found' not in body:
        return parseData(body, '00')
    # Page not generated yet: POST the card number, then retry the GET.
    data = urllib.urlencode({'cardNum': cardNumStr})
    url_query = 'http://www.6wm.cn/bank.PHP'
    post_resp = urlopen(url_query, data=data)
    post_resp.close()  # response body unused; close to avoid a leaked socket
    resp = urlopen(url_get)
    try:
        body = resp.read()
    finally:
        resp.close()
    if '404 Not Found' not in body:
        return parseData(body, '00')
    list_no_results.append(cardNumStr)
    return False
def getInfoFromWeb01(cardNumStr):
    """Query site 01 (cha789.com) for info on card number `cardNumStr`.

    Returns the dict from parseData on success; on failure appends the
    card number to the module-level list_no_results and returns False.
    """
    # e.g. http://www.cha789.com/bank_6228212028001510771.html
    url_get = 'http://www.cha789.com/bank_' + cardNumStr + '.html'
    resp = urlopen(url_get)
    try:
        body = resp.read()
    finally:
        resp.close()  # don't leak the socket
    # 'cxContext' marks a page that actually carries query results.
    if 'cxContext' not in body:
        list_no_results.append(cardNumStr)
        return False
    return parseData(body, '01')
if __name__ == '__main__':
    # Fill in with the real card-number strings to look up.
    list_card = []  # list of card number strings
    for card_num in list_card:
        # Try site 00 first; fall back to site 01 if it fails.
        result = getInfoFromWEB00(card_num)
        if result is False:
            result = getInfoFromWeb01(card_num)
        if result:
            list_yes_results.append(result)
相关文章