Python爬蟲學習筆記(四) — 使用Scrapy
4 min read · May 3, 2019
安裝
pip install Scrapy
定義Spider類別
# 作為爬蟲的主體
import scrapy
class NewsSpider(scrapy.Spider):
    """Minimal spider skeleton registered under the name ``news``.

    Only the ``name`` attribute is shown; the rest of the spider is elided
    in these notes.
    """

    name = 'news'
    # Placeholder for the remaining attributes and callbacks.
    ...
定義Item類別
# 決定爬取的資料該有哪些
import scrapy
class Article(scrapy.Item):
    """Item describing the fields collected for one scraped article.

    Only the ``headline`` field is shown; further fields are elided in
    these notes.
    """

    headline = scrapy.Field()
    # Placeholder for the remaining fields.
    ...
爬取Reddit圖片
import scrapy
class RedditSpider(scrapy.Spider):
    """Save the images found on the Reddit front page as a local HTML gallery.

    Every ``<img src>`` on the start page is inspected; URLs that look like
    images and do not come from an excluded domain are written as linked
    thumbnails to ``frontpage.html`` in the current working directory.
    """

    name = 'reddit'
    start_urls = [
        'https://www.reddit.com'
    ]

    # A URL is kept when it contains one of these substrings anywhere.
    _IMAGE_EXTENSIONS = ('.jpg', '.gif', '.png')
    # URLs containing one of these domains are skipped
    # (presumably Reddit's own UI assets -- confirm).
    _EXCLUDED_DOMAINS = ('redditstatic.com', 'redditmedia.com')
    # Markup emitted for each kept image; ``{url}`` is filled in per image.
    _IMAGE_TEMPLATE = (
        '\n<a href="{url}" target="_blank">\n'
        '<img src="{url}" height="33%" width="33%" />\n'
        '</a>\n'
    )

    def parse(self, response):
        """Write the matching image URLs from *response* to ``frontpage.html``.

        Args:
            response: The Scrapy response for one of ``start_urls``.

        The output file is overwritten on every call.
        """
        # Imported here so the block needs no new file-level import.
        from html import escape

        fragments = []
        for url in response.xpath('//img/@src').getall():
            is_image = any(ext in url for ext in self._IMAGE_EXTENSIONS)
            is_excluded = any(
                domain in url for domain in self._EXCLUDED_DOMAINS
            )
            if is_image and not is_excluded:
                # The URL comes from a remote page: escape it before placing
                # it inside an HTML attribute.
                fragments.append(self._IMAGE_TEMPLATE.format(url=escape(url)))

        # The ``with`` block closes the file; no explicit close() is needed.
        with open('frontpage.html', 'w', encoding='utf-8') as page:
            page.write(''.join(fragments))
爬取Amazon產品價格
import scrapy
from re import sub
from decimal import Decimal
def convert_money(money):
    """Convert a formatted price string such as ``'$1,234.56'`` to a Decimal.

    Every character other than a digit or a dot is removed before parsing,
    so currency symbols, thousands separators and whitespace are ignored
    (a leading minus sign is dropped as well).

    Args:
        money: The price text to convert.

    Returns:
        The numeric value as a ``decimal.Decimal``.

    Raises:
        decimal.InvalidOperation: If what remains after stripping is not a
            valid number (for example an empty string).
    """
    digits_only = sub(r'[^\d.]', '', money)
    return Decimal(digits_only)
class AmazonSpider(scrapy.Spider):
    """Print summary price statistics for an Amazon search results page."""

    name = 'amazon'
    start_urls = [
        'https://www.amazon.com/s?k=paint'
    ]

    def parse(self, response):
        """Extract the prices from *response* and print their statistics.

        Args:
            response: The Scrapy response for one of ``start_urls``.

        Prints a dict with the keys ``average_price``, ``lowest_price``,
        ``highest_price`` and ``total_prices``. Nothing is printed when the
        page contains no price elements.
        """
        # Find the Amazon price elements.
        prices = response.css('.a-price .a-offscreen::text').getall()

        # Sorted ascending so the first/last entries are the min/max.
        values = sorted(convert_money(price) for price in prices)
        if not values:
            # Without this guard the average would divide by zero and the
            # min/max lookups would raise IndexError.
            self.logger.warning('No prices found on %s', response.url)
            return

        stats = {
            'average_price': round(sum(values) / len(values), 2),
            'lowest_price': values[0],
            'highest_price': values[-1],
            'total_prices': len(values),
        }
        print(stats)