
[Python] A crawler that compiles novels from 花呗小说网 (lianxinw)


https://www.52pojie.cn/thread-1623166-1-1.html
Novels on this site can only be read one chapter at a time in the browser, and there is no txt download.
A friend of mine wanted to read one, and clicking through chapter after chapter was inconvenient, so he asked me to download a compiled copy. I wrote a crawler that gathers everything into a single text file.

import requests
from bs4 import BeautifulSoup as bsp
import time
list_links=[]
list_biaoti=[]
list_neirong=[]
file_name='save.txt'
url=r'http://www.lianxinw.com/book/64221/'
def get_links(url: str):  # collect all the chapter links from the book's index page
    qingqiu = requests.get(url=url)
    wangye = bsp(qingqiu.text, 'lxml')
    links = wangye.find_all('a', rel='chapter')
    for i in links:
        jieguo = i.get('href')
        if r'/' in jieguo:
            jieguo = jieguo.split(r'/')[-1]
        list_links.append(url + jieguo)
    list_links.sort()  # the links do not come back in reading order, so sort them once after collecting

def get_content(url: str):  # fetch the chapter title and body text from one chapter url
    list_neirong.clear()
    qingqiu = requests.get(url=url)
    wangye = bsp(qingqiu.text, 'lxml')
    biaoti = wangye.find('h1').get_text()
    list_biaoti.append(biaoti)
    jieguo = wangye.find('div', id="content")
    for i in jieguo.find_all('p'):
        list_neirong.append(i.get_text())

def main():
    get_links(url)
    zongliang = len(list_links)
    dangqian = 0
    with open(file_name, 'a', encoding='utf-8') as filesave:
        for url_neirong in list_links:
            dangqian = dangqian + 1
            print("processing: %s  progress: %.2f%%" % (url_neirong, dangqian / zongliang * 100))
            get_content(url_neirong)
            filesave.write('\n')  # blank line before the chapter title, for readability
            filesave.write(list_biaoti[dangqian - 1])
            for i in list_neirong:
                filesave.write('\n')  # one paragraph per line
                filesave.write(i)
            time.sleep(0.8)  # pause 0.8 seconds after each chapter to go easy on the server

if __name__ == '__main__':
    main()

In theory this works for any novel on this site; just change the url on line 8 of the script to the book's index page.
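
The lexicographic sort in get_links() happens to work when every chapter filename has the same number of digits, but it can scramble the order otherwise. Here is a minimal sketch of a numeric sort instead, assuming the chapter urls end in something like "/1234567.html"; sort_chapter_links is a hypothetical helper, not part of the original script:

import re

def sort_chapter_links(links):
    # sort chapter urls such as ".../1234567.html" by their numeric chapter id
    def chapter_id(link):
        m = re.search(r'(\d+)\.html?$', link)
        return int(m.group(1)) if m else 0
    return sorted(links, key=chapter_id)

# e.g. at the end of get_links():
# list_links[:] = sort_chapter_links(list_links)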

Baidu novel full-text scraper - currently it can only download the free chapters
https://www.52pojie.cn/thread-1633792-1-1.html

import time,os
import requests
import asyncio
import aiohttp
import aiofiles

async def get_chapters_ids(n_id):
    book_url = f'https://dushu.baidu.com/api/pc/getCatalog?data=%7B"book_id":{n_id}%7D'
    t_start = int(time.time())
    tasks =[]
    with requests.get(book_url) as respon:
        dic = respon.json()
        for i in dic['data']['novel']['items']:
            title = i['title']
            chapter_id = i['cid']
            tasks.append(asyncio.create_task(get_chapters(n_id,title,chapter_id)))
        await asyncio.wait(tasks)
    t_over = int(time.time())
    print('Download finished!')
    print('Total time:', t_over - t_start, 'seconds')

async def get_chapters(n_id,title,chapter_id):
    chapter_url = f'https://dushu.baidu.com/api/pc/getChapterContent?data=%7B"book_id":"{n_id}","cid":"{n_id}|{chapter_id}","need_bookinfo":1%7D'
    # print(chapter_url)
    async with aiohttp.ClientSession() as req:
        async with req.get(chapter_url) as respon:
            dic = await respon.json()
            async with aiofiles.open(f'D:\小说\{title}.txt',mode='w',encoding='utf-8') as f:
                await f.write(dic['data']['novel']['content'])
    print(title, 'downloaded')

if __name__ =='__main__':
    if not os.path.exists(r'd:\小说'):
        os.mkdir(r'd:\小说')
    novel_id = input('Enter the novel id: ')
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(get_chapters_ids(novel_id))
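
On Python 3.7 and newer, the manual event-loop setup at the bottom can be replaced with asyncio.run(), which creates and closes the loop for you. A minimal sketch of an equivalent entry point, under that assumption:

if __name__ == '__main__':
    if not os.path.exists(r'd:\小说'):
        os.mkdir(r'd:\小说')
    novel_id = input('Enter the novel id: ')
    asyncio.run(get_chapters_ids(novel_id))

Also note that the script creates one task per chapter and fires them all at once; if the catalog is large, an asyncio.Semaphore inside get_chapters() is the usual way to cap the number of concurrent requests.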
