Shadow
Python使用lxml抓取网站信息演示
python中使用lxml
python中使用lxml这个库抓取网页内容是比较简单的,下面是一个演示案例
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Demo: fetch a site with requests, parse it with lxml, and extract the
# navigation links, article list, and one article body via XPath.
import requests, hashlib
from lxml import etree

# Target URL to scrape
url = 'http://www.dcr163.cn'
# Request headers: a browser-like User-Agent so the site serves the normal page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}
# Fetch the page; the timeout keeps the script from hanging on a dead server
res = requests.get(url, headers=headers, timeout=10)
# Fail fast on HTTP errors instead of silently parsing an error page
res.raise_for_status()
# Build an etree document so we can run XPath queries against it
htmls = etree.HTML(res.text)
# Read the content attribute of <meta http-equiv="Content-Language">
# (the original discarded this result; keep it in a variable so it is usable)
content_lang = htmls.xpath('//meta[@http-equiv="Content-Language"]/@content')
# Site navigation links
navList = htmls.xpath('//ul[@id="search-cloud"]/li/a')
for i in navList:
    # print('标签:'+i.tag ,i.text,'==>',i.attrib['href'])
    pass
# Article list
lists = htmls.xpath('//article[@class="article-list wow jSlideUp"]/a')
for i in lists:
    # Article link
    artUrl = i.attrib['href']
    # Article title
    artTitle = i.attrib['title']
    print(artTitle, '==>', '链接', artUrl)
    # Fetch the article page (reuse the same headers and timeout as above)
    cres = requests.get(artUrl, headers=headers, timeout=10)
    # Parse the article page
    cHtmls = etree.HTML(cres.text)
    # Look up the <div id="markdown"> element that holds the article body;
    # guard against a missing element instead of crashing with IndexError
    matches = cHtmls.xpath('//div[@id="markdown"]')
    if not matches:
        continue
    content = matches[0]
    # Serialize the element subtree back into a normal HTML string
    contentHtml = etree.tostring(content, encoding='utf-8', pretty_print=True, method='html').decode('utf-8')
    print(contentHtml)
    break
Dcr163的博客
http://dcr163.cn/558.html(转载时请注明本文出处及文章链接)