Python爬蟲學習筆記(四) — 使用Scrapy


pip install Scrapy


# 作為爬蟲的主體
import scrapy

class NewsSpider(scrapy.Spider):
    """Minimal spider skeleton that serves as the body of the crawler."""

    # Name Scrapy uses to identify this spider (e.g. `scrapy crawl news`).
    name = 'news'


# 決定爬取的資料該有哪些
import scrapy

class Article(scrapy.Item):
    """Item declaring which pieces of data are collected by the crawl."""

    # The only declared field: the headline.
    headline = scrapy.Field()


import scrapy

class RedditSpider(scrapy.Spider):
    """Spider that turns the images on a crawled page into an HTML gallery.

    Every ``<img src>`` value in a response is checked for an image
    extension and against an exclusion list; the URLs that pass are rendered
    as linked thumbnails and written to ``frontpage.html``.
    """

    name = 'reddit'
    # TODO: the start URL(s) are missing from this snippet (the list literal
    # was left unterminated); add the page(s) to crawl here.
    start_urls = []

    def parse(self, response):
        """Write every matching image URL in ``response`` to ``frontpage.html``.

        Args:
            response: The Scrapy response for a crawled page.
        """
        # Imported locally so this block carries its own dependency.
        from html import escape

        image_extensions = ['.jpg', '.gif', '.png']
        # TODO: both entries are blank in this snippet; fill in the URL
        # substrings (domains) that should be skipped.
        excluded_domains = ['', '']
        # Blank entries are dropped: '' is a substring of every URL, so
        # keeping them would reject every single image.
        excluded_domains = [domain for domain in excluded_domains if domain]

        snippets = []
        for link in response.xpath('//img/@src'):
            # Extract the URL text from the element
            url = link.get()
            # Check if the URL contains an image extension
            if not any(extension in url for extension in image_extensions):
                continue
            if any(domain in url for domain in excluded_domains):
                continue
            # The URL comes from a scraped page, so escape it before placing
            # it inside HTML attributes.
            snippets.append('''
                <a href="{url}" target="_blank">
                    <img src="{url}" height="33%" width="33%" />
                </a>
                '''.format(url=escape(url, quote=True)))

        # Open an HTML file, save the results; leaving the with-block closes
        # the file, so no explicit close() is needed.
        with open('frontpage.html', 'w', encoding='utf-8') as page:
            page.write(''.join(snippets))


import scrapy

from re import sub
from decimal import Decimal

def convert_money(money: str) -> Decimal:
    """Convert a formatted money string into a ``Decimal``.

    Every character other than a digit or a dot is removed before parsing,
    so currency symbols, thousands separators and whitespace are dropped
    (as is any minus sign).

    Args:
        money: Text containing an amount, e.g. ``'$1,234.56'``.

    Returns:
        The amount as a ``Decimal``.

    Raises:
        decimal.InvalidOperation: If what remains after stripping is not a
            valid decimal number (for example when ``money`` has no digits).
    """
    return Decimal(sub(r'[^\d.]', '', money))

class AmazonSpider(scrapy.Spider):
    """Spider that summarises the prices found on each crawled page."""

    name = 'amazon'
    # TODO: the start URL(s) are missing from this snippet (the list literal
    # was left unterminated); add the page(s) to crawl here.
    start_urls = []

    def parse(self, response):
        """Compute price statistics for one response.

        Args:
            response: The Scrapy response for a crawled page.

        Yields:
            dict: ``average_price``, ``lowest_price`` and ``highest_price``
            (``Decimal`` values) plus ``total_prices`` (``int``). Nothing is
            yielded when the page contains no prices.
        """
        # Find the Amazon price element
        prices = response.css('.a-price .a-offscreen::text').getall()

        # Sort our values before calculating, so the first and last entries
        # are the lowest and highest price.
        values = sorted(convert_money(price) for price in prices)

        if not values:
            # Without this guard the average below would divide by zero.
            self.logger.warning('No prices found on %s', response.url)
            return

        # Calculate price statistics and emit them as a scraped item so the
        # result is not discarded.
        yield {
            'average_price': round(sum(values) / len(values), 2),
            'lowest_price': values[0],
            'highest_price': values[-1],
            'total_prices': len(values),
        }


