https://www.moerats.com/archives/501/
import requests, re, os
import threading

'''
The pagination rule changes starting from set 1256
'''

class myThread(threading.Thread):  # Subclass threading.Thread
    def __init__(self, url, dir, filename, headers):
        threading.Thread.__init__(self)
        self.threadID = filename
        self.url = url
        self.dir = dir
        self.filename = filename
        self.headers = headers

    def run(self):  # Put the work in run(); it executes as soon as the thread starts
        download_pic(self.url, self.dir, self.filename, self.headers)

def download_pic(url, dir, filename, headers):
    req = requests.get(url=url, headers=headers)
    if req.status_code == 200:
        with open(dir + '/' + str(filename) + '.jpg', 'wb') as f:
            f.write(req.content)

class spider:
    def __init__(self):
        self.page = 'http://www.mmjpg.com/mm/'
        self.img = 'http://img.mmjpg.com/'
        self.file = '.jpg'

    def del_main(self):
        flag = 1
        while True:
            page_url = self.page + str(flag)
            headers = {'Referer': page_url,
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
            req = requests.get(page_url, headers=headers)
            if req.status_code == 200:
                num_page = re.findall(r'picinfo = \[(.*?),(.*?),(.*?),(.*?)\];', req.text)
                img_tittle = re.findall(r'<h2>(.*?)</h2>', str(req.content, 'utf-8'))
                if not os.path.exists(str(img_tittle[0])):
                    os.makedirs(img_tittle[0])
                    if num_page[0][3] == '0':  # Old-style sets: plain sequential filenames
                        threads = []
                        download_img = self.img + num_page[0][0] + '/' + num_page[0][1] + '/'
                        print('Start downloading: ' + img_tittle[0])
                        for i in range(1, int(num_page[0][2]) + 1):
                            download_img_url = download_img + str(i) + self.file
                            thread = myThread(download_img_url, img_tittle[0], str(i), headers)
                            thread.start()
                            threads.append(thread)
                        for t in threads:
                            t.join()
                        print('Download finished')
                    else:  # New-style sets: fetch the filename suffixes from data.php
                        data = 'http://www.mmjpg.com/data.php?id=' + num_page[0][1] + '&page=8999'
                        req_data = requests.get(data, headers=headers)
                        names = req_data.text.split(',')
                        threads = []
                        download_img = self.img + num_page[0][0] + '/' + num_page[0][1] + '/'
                        print('Start downloading: ' + img_tittle[0])
                        for i in range(1, int(num_page[0][2]) + 1):
                            download_img_url = download_img + str(i) + 'i' + names[i - 1] + self.file
                            thread = myThread(download_img_url, img_tittle[0], str(i), headers)
                            thread.start()
                            threads.append(thread)
                        for t in threads:
                            t.join()
                        print('Download finished')
                    print('Downloaded ' + str(flag) + ' sets of images')
                    flag += 1
                else:
                    print('Folder already exists, skipping')
                    flag += 1
                    continue
            else:
                print('An error may have occurred, or crawling is finished')
                break

def main():
    a = spider()
    a.del_main()

if __name__ == '__main__':
    main()
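'''
The script above starts one myThread per image, so a large set means an equally
large burst of threads. A bounded pool from the standard library gives the same
start-then-join behavior with a fixed ceiling. A minimal sketch, assuming the
download_pic() helper above; download_all and max_workers are illustrative names,
not part of the original script.
'''
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(urls, dir, headers, max_workers=8):
    # Cap the number of live threads instead of spawning one per image
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(download_pic, url, dir, str(i), headers): url
                   for i, url in enumerate(urls, start=1)}
        for future in as_completed(futures):
            future.result()  # Surface any download error instead of losing it in a thread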
import asyncio
import base64
import os
import re
import time

import aiofiles
import aiohttp
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
}

async def img_download(title, img_list):
    title = ''.join(re.findall(r'[\u4e00-\u9fa5]', title))  # Strip special characters so the title can be used as a folder name
    if not os.path.exists(f'./img/{title}'):
        os.makedirs(f'./img/{title}')
    for img_url in img_list:
        img_url = img_url.replace('thumb300', 'mw1024')  # Swap the thumbnail path for the full-size image
        async with aiohttp.ClientSession() as session:
            async with session.get(img_url, headers=headers) as response:
                resp = await response.read()
                img_name = title + img_url.split('/')[-1]
                path = './img/' + title + '/' + img_name
                async with aiofiles.open(path, 'wb') as fp:
                    await fp.write(resp)
                print(img_name + " -- download finished")

async def request_list_page(list_url):
    resp = requests.get(list_url, headers=headers).text
    tree = etree.HTML(resp)
    datas = tree.xpath('//article')
    tasks = []
    for data in datas:
        title = data.xpath('./div/h2/a/text()')[0]
        img_list = data.xpath('./div/div[2]/div//div/img/@data-src')
        tasks.append(asyncio.create_task(img_download(title, img_list)))
    if tasks:  # asyncio.wait() raises on an empty task list
        await asyncio.wait(tasks)

start_time = time.time()
if __name__ == '__main__':
    for i in range(1, 61):
        url = base64.b64decode(b'aHR0cHM6Ly9tbXp6dHQuY29tL2JlYXV0eS9mYXNoaW9uL3BhZ2Uv').decode() + str(i)
        loop = asyncio.get_event_loop()
        loop.run_until_complete(request_list_page(url))
        # asyncio.run(request_list_page(url))
    print("All downloads finished, total seconds: " + str(time.time() - start_time))
https://www.52pojie.cn/thread-1590284-1-1.html
'''
@Description: multithreaded crawler
@Author: Levin-e
@Date: 2020-03-23 22:31:38
'''
import math
import re
import time
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool

import requests
from lxml import etree

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.54'
}

def get_info(url):
    html = get_html(url)
    selector = etree.HTML(html)
    info = selector.xpath('//div[@id="dinfo"]/span/text()')
    pic_num = re.findall(r'\d+', info[0])
    page_num = math.ceil(int(pic_num[0]) / 3)
    print('This set has %s images in total' % pic_num[0])
    print('This set has %d pages in total' % page_num)
    return page_num

def get_html(url):
    index = requests.get(url, headers=header)
    return index.text

def get_href(url):  # Collect the image download addresses on one page
    html = get_html(url)
    selector = etree.HTML(html)
    data = selector.xpath('//ul/img/@src')
    start_thread_save_img(data)  # Hand the list to the thread-pool function

def save_img(img_src):
    img_src = img_src.replace('/s', '')  # Drop the thumbnail marker to get the full-size image
    print("Downloading ->", img_src)
    urllib.request.urlretrieve(
        img_src, 'E:/photo/%s' % img_src.split('/')[-1])
    # Remember to change the save path, following the example below; pick a folder for your images
    # A nonexistent path will raise an error!!!
    # Example: E:/photo/

def start_thread_save_img(img_url_list):
    pool = ThreadPool(3)  # Three images per page, so three threads is enough; more would be wasted
    pool.map(save_img, img_url_list)
    pool.close()

def main():
    url = "https://www.nvshens.net/g/30991/"  # Note the target site's URL format: the first page differs from the later pages
    info = get_info(url)
    start_time = time.time()
    get_href(url)
    second_url = "https://www.nvshens.net/g/30991/{}.html"  # Change the address both here and above
    for i in range(2, info + 1):  # If the download fails partway, change the page number here and resume by hand
        get_href(second_url.format(i))
        time.sleep(2)  # Don't make this too small; be kind to someone else's server and give it room to breathe
    end_time = time.time()
    print("Total time:", end_time - start_time)

if __name__ == '__main__':
    main()
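'''
The comments in the script above warn that a failed download means hand-editing the
page number and restarting. A small retry wrapper around urlretrieve can absorb
transient network errors instead. A minimal sketch: save_img_retry, retries, and
delay are illustrative names not in the original; it can be passed to pool.map() in
place of save_img without touching the pool code.
'''
import time
import urllib.request

def save_img_retry(img_src, retries=3, delay=2):
    img_src = img_src.replace('/s', '')  # Same thumbnail-marker stripping as save_img
    dest = 'E:/photo/%s' % img_src.split('/')[-1]
    for attempt in range(1, retries + 1):
        try:
            urllib.request.urlretrieve(img_src, dest)
            return
        except OSError as e:  # URLError and friends are OSError subclasses
            print("Attempt %d failed for %s: %s" % (attempt, img_src, e))
            time.sleep(delay)  # Back off before retrying
    print("Giving up on", img_src)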
https://www.52pojie.cn/thread-1594038-1-1.html
Please credit when reposting: 有爱前端 » Crawling beauty images with Python