python爬取豆瓣新书清单

2023-01-31 01:01:48 清单豆瓣新书

使用python3的requests库快速获取豆瓣图书推荐的新书清单，并保存书籍信息和图书缩略图图片到本地

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:aiker Zhao
@file:douban3.py
@time:上午10:34
"""
import JSON
import os
import re
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException

dir = 'z:\\douban\\'

def get_WEB(url):
    try:
        rq = requests.get(url)
        if rq.status_code == 200:
            return rq.text
        return None
    except RequestException:
        return None

def parse_web(html):
    pattern = re.compile('<li\sclass="">.*?cover".*?href="(.*?)"\stitle="(.*?)".*?img\***c="(.*?)"' +
                         '.*?class="author">(.*?)<.*?year">(.*?)<.*?publisher">(.*?)<.*?</li>', re.S)
    results = re.findall(pattern, html)
    # print(results)
    for i in results:
        # url, title, img, author, yeah, publisher = i
        # author = re.sub('\s', '', author)
        # yeah = re.sub('\s', '', yeah)
        # publisher = re.sub('\s', '', publisher)
        # print(url, title, img, author, yeah, publisher)
        yield {
            'title': i[1],
            'url': i[0],
            'img': i[2],
            'author': i[3].strip(),
            'yeah': i[4].strip(),
            'publisher': i[5].strip()
        }
        # print(url, title, img, author, yeah, publisher)
        # return img,title

def save_image(title, img):
    images = dir + title + '.jpg'
    if os.path.exists(images):
        pass
    else:
        with open(images, 'wb') as f:
            f.write(requests.get(img).content)
            f.close()

def save_info(content):
    info = dir + 'info.txt'
    with open(info, 'a', encoding='utf-8') as fd: #防止出现ascII
        fd.write(json.dumps(content, ensure_ascii=False) + '\n') ##防止出现ascII
        fd.close()

def main():
    url = 'https://book.douban.com/'
    html = get_web(url)
    # parse_web(html)
    for i in parse_web(html):
        print(i)
        save_info(i)
        save_image(i.get('title'), i.get('img'))

if __name__ == '__main__':
    main()

python爬取豆瓣新书清单

心得：
- 需要注意正则的匹配规则的准确度，否则会没有响应，或者无限超时

相关文章