"""Scrape a Douban book list with XPath and store every book in MongoDB.

This builds on the earlier XPath-parsing example: instead of only printing
the scraped records, each record is inserted into a MongoDB collection.
Reference code follows.
"""
import re

import pymongo
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0'
}

# Patterns used to pull the individual fields out of the raw text nodes.
# Compiled once here because they are applied to every book on every page.
_TITLE_RE = re.compile(r"\w+")
_AUTHOR_RE = re.compile(r"(?<=作者:\s)(.*)", re.M)
_COMPANY_RE = re.compile(r"(?<=出版社:\s)(.*)")
_DATE_RE = re.compile(r"\d{4}(-\d{1,2})?")


def getOnePage(url):
    """Download *url* and return the response body as text.

    Returns None when the request fails (connection error, timeout, ...)
    or when the server answers with anything other than HTTP 200.
    """
    try:
        # The request itself is what can raise, so it must live inside the
        # try block. The timeout keeps a stalled server from hanging the run.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return None
    if resp.status_code == 200:
        return resp.text
    return None


def _nth(values, index=0):
    """Return values[index], or None when the list is too short."""
    return values[index] if len(values) > index else None


def _search(pattern, text):
    """Return the part of *text* matched by *pattern*, or None if no match."""
    if text is None:
        return None
    match = pattern.search(text)
    return match.group() if match is not None else None


def _parse_item(item):
    """Build the record for one ``doulist-item`` element.

    Every field that cannot be found in the markup is set to None instead
    of raising, so one incomplete entry does not abort the whole page.
    """
    # The abstract's text nodes are read positionally:
    # [0] author line, [1] publisher line, [2] publication-date line.
    abstract = item.xpath('.//div[@class="abstract"]/text()')
    title = _nth(item.xpath('.//div[@class="title"]/a/text()'))
    return {
        'bname': _search(_TITLE_RE, title),
        'author': _search(_AUTHOR_RE, _nth(abstract, 0)),
        'company': _search(_COMPANY_RE, _nth(abstract, 1)),
        'b-date': _search(_DATE_RE, _nth(abstract, 2)),
        'rate': _nth(item.xpath('.//div[@class="rating"]/span[last()-1]/text()')),
        # Cover image URL of the book.
        'pic-url': _nth(item.xpath('.//div[@class="post"]/a/img/@src')),
    }


def parseOnePage(html):
    """Extract all books from one list page and insert them into MongoDB.

    Each book is printed (tab-separated line, then the dict) and stored in
    collection ``collection-book`` of database ``db-novels`` on the default
    local MongoDB server. MongoDB creates the database and the collection
    on first insert if they do not exist yet.

    Does nothing when *html* is None or empty.
    """
    if not html:
        return
    selector_html = etree.HTML(html)
    if selector_html is None:
        # Nothing parseable in the document.
        return
    # One div per book in the list.
    items = selector_html.xpath('//div[@class="doulist-item"]')

    # The context manager closes the connection when the page is done.
    with pymongo.MongoClient() as client:
        collection = client['db-novels']['collection-book']
        for item in items:
            row = _parse_item(item)
            if row['bname'] is None:
                # Entry without a title: not a usable book record, skip it.
                continue
            # Missing fields are printed as empty columns.
            print('\t'.join(value or '' for value in row.values()))
            print(row)
            collection.insert_one(row)


def getTop100(url):
    """Fetch one list page and save the books found on it to MongoDB."""
    html = getOnePage(url)
    if html is None:
        print('failed to fetch ' + url)
        return
    parseOnePage(html)


# URLs of the four pages of the list (25 books per page).
urls = ['https://www.douban.com/doulist/45004834/?start={}'.format(str(i))
        for i in range(0, 100, 25)]

if __name__ == '__main__':
    for url in urls:
        print(url)
        getTop100(url)
运行结果如下: