Python爬蟲學習筆記(九) — 多線程/多進程爬蟲

多線程

def download_all_images(list_page_urls):
works = len(list_page_urls)
with concurrent.futures.ThreadPoolExecutor(works) as exector:
for url in list_page_urls:
exector.submit(download,url)

完整程式如下

import concurrent
import os
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
def header(referer): headers = {
'Host': 'i.meizitu.net',
'Pragma': 'no-cache',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
'Referer': '{}'.format(referer),
}
return headersdef request_page(url):
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
except requests.RequestException:
return None
def get_page_urls(): for i in range(1, 2):
baseurl = 'https://www.mzitu.com/page/{}'.format(i)
html = request_page(baseurl)
soup = BeautifulSoup(html, 'lxml')
list = soup.find(class_='postlist').find_all('li')
urls = []
for item in list:
url = item.find('span').find('a').get('href')
print('页面链接:%s' % url)
urls.append(url)
return urlsdef download_Pic(title, image_list):
# 新建文件夹
os.mkdir(title)
j = 1
# 下载图片
for item in image_list:
filename = '%s/%s.jpg' % (title, str(j))
print('downloading....%s : NO.%s' % (title, str(j)))
with open(filename, 'wb') as f:
img = requests.get(item, headers=header(item)).content
f.write(img)
j += 1
def download(url):
html = request_page(url)
soup = BeautifulSoup(html, 'lxml')
total = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
title = soup.find('h2').string
image_list = []
for i in range(int(total)):
html = request_page(url + '/%s' % (i + 1))
soup = BeautifulSoup(html, 'lxml')
img_url = soup.find('img').get('src')
image_list.append(img_url)
download_Pic(title, image_list)def download_all_images(list_page_urls):
# 获取每一个详情妹纸
# works = len(list_page_urls)
with concurrent.futures.ProcessPoolExecutor(max_workers=5) as exector:
for url in list_page_urls:
exector.submit(download, url)
if __name__ == '__main__':
# 获取每一页的链接和名称
list_page_urls = get_page_urls()
download_all_images(list_page_urls)

多進程

def main(url):
html = request_douban(url)
soup = BeautifulSoup(html, 'lxml')
save_content(soup)
if __name__ == '__main__':
start = time.time()
urls = []
pool = multiprocessing.Pool(multiprocessing.cpu_count())
for i in range(0, 10):
url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
urls.append(url)
pool.map(main, urls)
pool.close()
pool.join()

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store