Scraping content from gexings.com (个性说说网) with Python and splitting it into phrases for database storage
Without further ado, straight to the code. Change the database settings and related dependencies in the code to your own.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Scrape https://www.gexings.com/ (a "personalized sayings" site)
import requests, re, MySQLdb
# Fetch one category page of the site and split its articles into phrases
def getToask(url):
    # Mobile user agent for the requests
    headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36'}
    r = requests.get(url, headers=headers)
    # Use the encoding detected from the response body
    r.encoding = r.apparent_encoding
    # Page HTML
    html = r.text
    # Extract the category name from the breadcrumb ("当前位置" = "current location")
    catReg = re.compile(r'<strong>当前位置:</strong>(.*?)</div>')
    catReg2 = re.compile(r"<a href='\S+'>(\w+)</a>")
    catHtml = catReg.findall(html)
    catStrs = catReg2.findall(catHtml[0])
    # The last breadcrumb link is the current category
    catName = catStrs[-1]
    # Extract the article-list block
    reg = re.compile(r'<div class="listbox">\s+<ul class="e2">([\s\S]*</li>)\s+</ul>\s+</div>\s+<!-- /listbox -->')
    lists = reg.findall(html)
    if len(lists) <= 0:
        return 'error: article-list block not found'
    # Collect all article links inside the block
    regHref = re.compile(r'(https?://www\.gexings\.com/[\w/]+\d+\.html)')
    hrefs = regHref.findall(lists[0])
    if len(hrefs) <= 0:
        return 'error: no article links found'
    # Regex for the article body ("相关阅读" = "related reading" marks its end)
    contentReg = re.compile(r'<div class="content">([\s\S]*)\s+<strong>相关阅读:</strong>')
    # Regex for the individual phrases of an article
    phraseReg = re.compile(r'<p>(.+?)</p>')
    # All phrases split out of this page's articles
    allLists = []
    # (catname, content) rows to insert
    values = []
    # Walk the article links
    for href in hrefs:
        # Fetch the article detail page
        ccontent = requests.get(href, headers=headers)
        ccontent.encoding = ccontent.apparent_encoding
        htmlContent = ccontent.text
        contents = contentReg.findall(htmlContent)
        if len(contents) <= 0:
            continue
        # Phrases are separated by <p> tags in the article body
        phrases = phraseReg.findall(contents[0])
        if len(phrases) <= 0:
            continue
        for juzi in phrases:
            # Skip embedded images
            if re.match('<img', juzi) is not None:
                continue
            # Strip all whitespace from the phrase
            juzi = re.sub(r'\s', '', juzi)
            values.append((catName, juzi))
            allLists.append(juzi)
    # Insert everything collected from this category page in one statement
    if len(values) == 0:
        return
    # Parameterized query, so quotes inside a phrase cannot break the SQL
    insertSql = 'INSERT INTO contents(`catname`,`content`) VALUES (%s,%s)'
    res = cursor.executemany(insertSql, values)
    db.commit()
    if res:
        print('insert succeeded')
    else:
        print('insert failed')
    print('category:', catName)
    print('phrases:', len(allLists), 'sentences')
    print("====================\n")
# Open the database connection
db = MySQLdb.connect('localhost', 'root', 'root', 'toask', charset='utf8mb4')
# Get a cursor with the cursor() method
cursor = db.cursor()
# Walk the category's pages
for i in range(24, 28):
    print('page:', i)
    url = 'https://www.gexings.com/mingyan/lizhi/list_49_{page}.html'.format(page=i)
    # Scrape the page and split its content into phrases
    err = getToask(url)
    if err:
        print(err)
print('done scraping')
# Close the database connection
db.close()
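The script assumes a contents table already exists; nothing above creates it. A minimal sketch of a schema matching the columns used here (the `id` column is implied by the queries below; names other than `catname` and `content`, and all sizes and types, are assumptions):

CREATE TABLE `contents` (
  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,  -- implied by the SELECT `id` query below
  `catname` VARCHAR(64) NOT NULL,  -- category name taken from the breadcrumb
  `content` TEXT NOT NULL          -- one phrase per row
) DEFAULT CHARSET=utf8mb4;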
As a bonus, a couple of MySQL regex queries for finding phrases that kept their original list numbering ("1.", "一、") after splitting:
SELECT COUNT(*) FROM contents WHERE content REGEXP '^[0-9]+\\.';
SELECT `id`,`content` FROM `contents` WHERE content REGEXP '^[一二三四五六七八九十]+、';
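If you would rather strip that numbering than just find it, and you are on MySQL 8.0+ (which added REGEXP_REPLACE), a sketch:

UPDATE `contents`
SET `content` = REGEXP_REPLACE(`content`, '^([0-9]+\\.|[一二三四五六七八九十]+、)', '')
WHERE `content` REGEXP '^([0-9]+\\.|[一二三四五六七八九十]+、)';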
And that's it: one category scraped, with its content split into phrases~
Dcr163's blog
http://dcr163.cn/462.html (please credit this source and link to the article when reposting)