Python爬蟲學習筆記(四) — 使用Scrapy

安裝

pip install Scrapy

定義Spider類別

# Serves as the main body of the crawler
import scrapy


class NewsSpider(scrapy.Spider):
    """Skeleton spider definition; the rest of the class is elided here."""

    name = 'news'
    ...

定義Item類別

# Determines which fields the scraped data should have
import scrapy


class Article(scrapy.Item):
    """Skeleton item definition; further fields are elided here."""

    headline = scrapy.Field()
    ...

爬取Reddit圖片

import scrapy

class RedditSpider(scrapy.Spider):
    """Spider that collects image links from its start page into an HTML file."""

    name = 'reddit'
    start_urls = [
        'https://www.reddit.com',
    ]

    def parse(self, response):
        """Write the page's matching ``<img>`` sources to ``frontpage.html``.

        A source is kept when it contains one of the image extensions and
        does not contain one of the excluded domains. Each kept URL becomes
        a linked thumbnail in the output file, which is overwritten on
        every call.

        Args:
            response: The response object handed to the callback; only its
                ``xpath`` method is used here.
        """
        # Function-scope import keeps this block self-contained.
        from html import escape

        image_extensions = ('.jpg', '.gif', '.png')
        excluded_domains = ('redditstatic.com', 'redditmedia.com')

        fragments = []
        for link in response.xpath('//img/@src'):
            # Extract the URL text from the element
            url = link.get()

            # Substring checks: the extension may be followed by a query string
            has_image_extension = any(ext in url for ext in image_extensions)
            is_excluded = any(domain in url for domain in excluded_domains)
            if not has_image_extension or is_excluded:
                continue

            # The URL comes from a remote page, so escape it before placing
            # it inside HTML attributes.
            safe_url = escape(url)
            fragments.append('''
<a href="{url}" target="_blank">
<img src="{url}" height="33%" width="33%" />
</a>
'''.format(url=safe_url))

        # The context manager closes the file; no explicit close() is needed.
        with open('frontpage.html', 'w', encoding='utf-8') as page:
            page.write(''.join(fragments))

爬取Amazon產品價格

import scrapy

from re import sub
from decimal import Decimal


def convert_money(money: str) -> Decimal:
    """Convert a price string such as ``'$1,234.56'`` to a ``Decimal``.

    Every character other than a digit or ``.`` is removed before parsing.

    Args:
        money: Text containing a single price.

    Returns:
        The numeric value of the price.

    Raises:
        decimal.InvalidOperation: If what remains after stripping is not a
            valid number (for example, an empty string).
    """
    digits_only = sub(r'[^\d.]', '', money)
    return Decimal(digits_only)


class AmazonSpider(scrapy.Spider):
    """Spider that prints price statistics for its start page."""

    name = 'amazon'
    start_urls = [
        'https://www.amazon.com/s?k=paint',
    ]

    def parse(self, response):
        """Collect the prices on the page and print summary statistics.

        The printed dict holds ``average_price`` (rounded to 2 places),
        ``lowest_price``, ``highest_price`` and ``total_prices``. When no
        price element matches, a warning is logged and nothing is printed.

        Args:
            response: The response object handed to the callback; its
                ``css`` method and ``url`` attribute are used here.
        """
        # Find the Amazon price elements
        prices = response.css('.a-price .a-offscreen::text').getall()

        # Sorted so the lowest and highest values sit at the two ends
        values = sorted(convert_money(price) for price in prices)

        # Guard: an empty list would divide by zero and fail on indexing
        if not values:
            self.logger.warning('No prices found on %s', response.url)
            return

        # Calculate price statistics
        stats = {
            'average_price': round(sum(values) / len(values), 2),
            'lowest_price': values[0],
            'highest_price': values[-1],
            'total_prices': len(values),
        }

        print(stats)

Written by

Machine Learning / Deep Learning / Python / Flutter cakeresume.com/yanwei-liu

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store