Python3 抓取糗事百科最新文本笑话
from bs4 import BeautifulSoup
import requests


class Downxh(object):
    """Download the latest text jokes from the Qiushibaike front page.

    Workflow: ``getHtml`` collects the detail-page URLs listed on the
    ``/text/`` index page into ``self.urls``; ``saveContent`` fetches each
    URL and writes the joke text to ``最新笑话.txt``; ``run`` does both.
    """

    # Seconds to wait for each HTTP request; without a timeout a stalled
    # connection would block the script indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.domain = 'https://www.qiushibaike.com'
        self.target = self.domain + '/text/'
        self.urls = []

    def getHtml(self):
        """Fetch the index page and append every joke detail URL to ``self.urls``.

        Links are the ``<a class="contentHerf">`` elements inside the
        ``<div id="content-left">`` container. If the container is absent,
        no URLs are added.
        """
        response = requests.get(self.target, timeout=self.REQUEST_TIMEOUT)
        page = BeautifulSoup(response.text, 'html.parser')
        container = page.find('div', id='content-left')
        if container is None:
            return
        # 'contentHerf' (sic) is the class name looked up on the page.
        for link in container.find_all('a', class_='contentHerf'):
            href = link.get('href')
            if href:  # skip anchors without a usable href
                self.urls.append(self.domain + href)

    def saveContent(self):
        """Download every URL in ``self.urls`` and save the jokes to a text file.

        The first ``<div class="content">`` of each page is written to
        ``最新笑话.txt`` (UTF-8, overwritten on each call), followed by a blank
        line. Pages without such a div are reported and skipped.
        """
        print('--正在下载最新' + str(len(self.urls)) + '条笑话--')
        # The context manager closes the file even if a download fails midway.
        with open('最新笑话.txt', 'w', encoding='utf-8') as f:
            for i, url in enumerate(self.urls, start=1):
                print('正在下载第:' + str(i) + '条')
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                page = BeautifulSoup(response.text, 'html.parser')
                content_div = page.find('div', class_='content')
                if content_div is None:
                    print('第' + str(i) + '条未找到内容,已跳过')
                    continue
                f.write(content_div.text + '\n\n')

    def run(self):
        """Run the full scrape: collect the URLs, then download and save them."""
        self.getHtml()
        self.saveContent()
        print('下载完成')


downObj = Downxh()
# Start the download.
downObj.run()
如果尚未安装 BeautifulSoup 和 requests 这两个库,请先用 pip 安装:
pip install beautifulsoup4 requests
Dcr163的博客
http://dcr163.cn/214.html(转载时请注明本文出处及文章链接)