https://www.52pojie.cn/thread-1623166-1-1.html
Novels on this site are presented page by page, and no txt download is offered.
A friend of mine wanted to read one, but clicking through chapter after chapter is inconvenient, so they asked me to download the whole book for them. I wrote a crawler that collects everything into a single text file.
import requests
from bs4 import BeautifulSoup as bsp
import time
list_links=[]
list_biaoti=[]
list_neirong=[]
file_name='save.txt'
url=r'http://www.lianxinw.com/book/64221/'
def get_links(url: str):  # collect all chapter links from the index page
    qingqiu = requests.get(url=url)
    wangye = bsp(qingqiu.text, 'lxml')
    links = wangye.find_all('a', rel='chapter')
    for i in links:
        jieguo = i.get('href')
        if r'/' in jieguo:
            jieguo = jieguo.split(r'/')[-1]
        list_links.append(url + jieguo)
    list_links.sort()  # for some reason the links didn't come back in order; maybe it's Python's asynchronous handling during parsing

def get_content(url: str):  # fetch the title and body text from one chapter url
    #pipei=re.compile(r'<div id="content">(.*?)</div>')
    list_neirong.clear()
    qingqiu = requests.get(url=url)
    wangye = bsp(qingqiu.text, 'lxml')
    biaoti = wangye.find('h1').get_text()
    list_biaoti.append(biaoti)
    jieguo = wangye.find('div', id="content")
    for i in jieguo.find_all('p'):
        list_neirong.append(i.get_text())

def main():
    get_links(url)
    zongliang = len(list_links)
    dangqian = 0
    with open(file_name, 'a', encoding='utf-8') as filesave:
        for url_neirong in list_links:
            dangqian = dangqian + 1
            print("current url: %s  progress: %.2f%%" % (url_neirong, dangqian / zongliang * 100))
            get_content(url_neirong)
            filesave.write('\n')  # blank line before the title so it reads better
            filesave.write(list_biaoti[dangqian - 1])
            for i in list_neirong:
                filesave.write('\n')  # one paragraph per line
                filesave.write(i)
            time.sleep(0.8)  # pause 0.8 seconds after each chapter

if __name__ == '__main__':
    main()
In theory this works for any novel on this site; you only need to swap out the url on line 8 of the code.
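If you'd rather not edit the source every time, one small tweak is to accept the book URL on the command line. This is only a sketch of my own (the sys.argv handling is not in the original script); it assumes everything else stays exactly as above and replaces the if __name__ == '__main__' block at the end:

import sys

if __name__ == '__main__':
    # hypothetical addition: run as  python spider.py http://www.lianxinw.com/book/64221/
    # (spider.py is just a placeholder for whatever you name the file)
    if len(sys.argv) > 1:
        url = sys.argv[1]        # overrides the url defined at the top of the script
        if not url.endswith('/'):
            url += '/'           # get_links() builds chapter links as url + chapter id, so keep the trailing slash
    main()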
Baidu novels full-text scraper - currently it can only download the free chapters
https://www.52pojie.cn/thread-1633792-1-1.html
import time,os
import requests
import asyncio
import aiohttp
import aiofiles
async def get_chapters_ids(n_id):
    # fetch the catalogue for this book and start one download task per chapter
    book_url = f'https://dushu.baidu.com/api/pc/getCatalog?data=%7B"book_id":{n_id}%7D'
    t_start = int(time.time())
    tasks = []
    with requests.get(book_url) as respon:
        dic = respon.json()
        for i in dic['data']['novel']['items']:
            title = i['title']
            chapter_id = i['cid']
            tasks.append(asyncio.create_task(get_chapters(n_id, title, chapter_id)))
        await asyncio.wait(tasks)
    t_over = int(time.time())
    print('Download finished!')
    print('Total time:', t_over - t_start, 'seconds')

async def get_chapters(n_id, title, chapter_id):
    # fetch one chapter and write it to its own txt file
    chapter_url = f'https://dushu.baidu.com/api/pc/getChapterContent?data=%7B"book_id":"{n_id}","cid":"{n_id}|{chapter_id}","need_bookinfo":1%7D'
    # print(chapter_url)
    async with aiohttp.ClientSession() as req:
        async with req.get(chapter_url) as respon:
            dic = await respon.json()
            async with aiofiles.open(rf'D:\小说\{title}.txt', mode='w', encoding='utf-8') as f:
                await f.write(dic['data']['novel']['content'])
            print(title, 'downloaded')

if __name__ == '__main__':
    if not os.path.exists(r'd:\小说'):
        os.mkdir(r'd:\小说')
    novel_id = input('Enter the novel id: ')
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(get_chapters_ids(novel_id))
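The data=%7B...%7D part of both API URLs is just URL-encoded JSON written out by hand. If you prefer building those URLs from a dict, something like the sketch below works; build_catalog_url and build_chapter_url are my own names rather than anything from Baidu's API, and I have not verified that the fully percent-encoded form is accepted exactly like the hand-written one.

import json
from urllib.parse import quote

def build_catalog_url(n_id):
    # hypothetical helper: produces the same JSON as the hand-written catalogue URL above
    data = json.dumps({"book_id": int(n_id)}, separators=(',', ':'))
    return 'https://dushu.baidu.com/api/pc/getCatalog?data=' + quote(data)

def build_chapter_url(n_id, chapter_id):
    # hypothetical helper: produces the same JSON as the hand-written chapter URL above
    data = json.dumps({"book_id": str(n_id), "cid": f"{n_id}|{chapter_id}", "need_bookinfo": 1},
                      separators=(',', ':'))
    return 'https://dushu.baidu.com/api/pc/getChapterContent?data=' + quote(data)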