Python爬蟲學習筆記(一) - Requests, BeautifulSoup, 正規表達式,API

Python 網路爬蟲與資料分析入門實戰 GitHub Repo

先備知識

GET:讀取一般網頁內容
POST:填表單傳送資料時使用

安裝Requests和BeautifulSoup

pip install beautifulsoup4
pip install requests

引入模組

import requests
from bs4 import BeautifulSoup

Requests

url = "https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html"
resp = requests.get(url)

# If the downloaded text is garbled, set the encoding the page really uses.
# Choose ONE value: a second assignment simply overrides the first.
resp.encoding = 'utf-8'
# resp.encoding = 'big5'  # e.g. Big5 pages, or 'gbk' for Simplified Chinese

# HTTP status code: 200 means OK. Codes starting with 2 are normally fine,
# codes starting with 4 or 5 indicate an error.
print(resp.status_code)

# To decode a garbled string by hand, paste it into the site below and pick
# the matching encoding:
# https://www.webatic.com/url-convertor

payload:使傳回資料以指定格式顯示(GET方法)

import requests

# Query-string parameters; requests encodes them into the URL of the GET.
query = {'key1': 'value1', 'key2': 'value2'}
response = requests.get("http://httpbin.org/get", params=query)
print(response.url)  # http://httpbin.org/get?key1=value1&key2=value2

payload:使傳回資料以指定格式顯示(POST方法)

import requests

# Form fields sent in the body of the POST request.
form_fields = {'key1': 'value1', 'key2': 'value2'}
response = requests.post("http://httpbin.org/post", data=form_fields)
print(response.text)  # the body is shown as JSON

操作Cookie

import requests
from bs4 import BeautifulSoup

# Form fields posted to the over-18 confirmation page.
payload = {
    'from': 'https://www.ptt.cc/bbs/Gossiping/index.html',
    'yes': 'yes'
}

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}

# A Session keeps the cookies returned by the POST and sends them with the GET.
rs = requests.Session()
rs.post('https://www.ptt.cc/ask/over18', data=payload, headers=headers)
res = rs.get('https://www.ptt.cc/bbs/Gossiping/index.html', headers=headers)

soup = BeautifulSoup(res.text, 'html.parser')
items = soup.select('.r-ent')
for item in items:
    print(item.select('.date')[0].text, item.select('.author')[0].text, item.select('.title')[0].text)

翻頁

# Walk through the pages with a for loop (pages 1 to 99).
for i in range(1, 100):
    url = 'http://example.com/page' + str(i)

判斷網頁是否更新

import requests
import os
import json
import hashlib


def save_newaqi():
    '''Write the downloaded JSON payload to newaqi.json.'''
    with open(fn, 'w') as f:
        json.dump(aqijsons.json(), f)


def save_hashvalue():
    '''Write the current hash (newhash) to hashvalue.txt.'''
    with open(fn_hash, 'w') as fileobj:
        fileobj.write(newhash)


def cal_hashvalue():
    '''Return the MD5 hex digest of the downloaded response text.'''
    data = hashlib.md5()
    data.update(aqijsons.text.encode('utf-8'))
    return data.hexdigest()


url = ('http://opendata.epa.gov.tw/webapi/Data/REWIQA/?$orderby=SiteName&$'
       'skip=0&$top=1000&format=json')
fn = 'newaqi.json'
fn_hash = 'hashvalue.txt'  # file that stores the hash of the previous run

try:
    aqijsons = requests.get(url)
    aqijsons.raise_for_status()
    print('下載成功')
except requests.RequestException as err:
    print('下載失敗')
    # Nothing was downloaded, so there is nothing to hash or save.
    raise SystemExit(err)

# The hash is needed on both paths, so compute it once up front.
newhash = cal_hashvalue()

if os.path.exists(fn_hash):
    print('newhash = ', newhash)
    # Read the hash stored by the previous run.
    with open(fn_hash, 'r') as fnObj:
        oldhash = fnObj.read()
    print('oldhash = ', oldhash)
    if newhash == oldhash:
        print('環保署空氣品質資料未更新')
    else:
        print('環保署空氣品質資料已經更新')
        save_newaqi()
        save_hashvalue()
else:
    # No stored hash yet: treat this as the first run.
    print('第一次啟動此程式')
    print('哈希值 = ', newhash)
    save_hashvalue()
    save_newaqi()

使用代理伺服器

import urllib.request

# urllib2 exists only in Python 2; in Python 3 the same API lives in
# urllib.request.
proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib.request.build_opener(proxy)
# After install_opener, every urlopen() call goes through the proxy.
urllib.request.install_opener(opener)
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read())

處理防爬蟲機制 (User-agent)

# Tip: mobile pages are usually leaner, so sending a mobile User-Agent can
# give you a simpler page to scrape.

# Send a browser-like User-Agent header.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
resp = requests.get(url, headers=headers)
resp.encoding = 'utf8'
print(resp.text)

# Pick a User-Agent at random.
import random
headerlist = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
              "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 OPR/43.0.2442.991",
              "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 OPR/42.0.2393.94",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36 OPR/47.0.2631.39",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
              "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
              "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
              "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
              "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
              "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
              "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"]
user_agent = random.choice(headerlist)
headers = {'User-Agent': user_agent}

# Using Google's (Googlebot) headers - references:
# https://support.google.com/webmasters/answer/1061943?hl=zh-Hant
# https://developers.whatismybrowser.com/useragents/explore/software_name/googlebot/
# https://searchengineland.com/google-to-update-googlebots-user-agent-322859

防止一秒太多次Requests

import requests
import time
import random

url = "XXXXXXXXXXXXXXXXX"
contextLi = []
i = 0
while i < 10:
    # Named resp rather than re so the regex module name is not shadowed.
    resp = requests.get(url)
    resp.encoding = 'utf8'
    contextLi.append(resp.text)
    i += 1
    print(i , " succeed")
    # Pause between requests: a random 1-10 second delay.
    # (Use time.sleep(2) instead for a fixed two-second pause.)
    time.sleep(random.randint(1, 10))

需要帳號登入的網站

import requests

url = 'XXXXXXXXXXXXXXXXXXXXXXXXXXX'
# Custom headers
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

# Send the custom headers with the GET request.
r = requests.get(url, headers=headers)

# Page that requires an account: pass (username, password) as auth.
r = requests.get(url, auth=('帳號', '密碼'))

# Give up if there is no response within 3 seconds.
requests.get('http://github.com/', timeout=3)
--------------------------------------------------------------------
使用Session方法登入網站
# The Session stores the cookies set by the login POST and sends them again
# with the following GET.
session = requests.Session()
credentials = {'username': 'Morvan', 'password': 'password'}
login_resp = session.post('http://pythonscraping.com/pages/cookies/welcome.php', data=credentials)
print(login_resp.cookies.get_dict())
profile_resp = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(profile_resp.text)

使用API

import requests

url = 'XXXXXXXXXXXXXXXXXXXXXXXXXXX'
token = 'yourAPI_KEY'
# The API key travels in a request header named "token".
auth_headers = {"token": token}
r = requests.get(url, headers=auth_headers)

被封鎖IP了嗎

from bs4 import BeautifulSoup
import requests
import random

if __name__ == '__main__':
    # Proxy server lookup: http://cn-proxy.com/
    proxy_ips = ['51.15.227.220:3128', '81.162.56.154:8081']
    ip = random.choice(proxy_ips)
    print('Use', ip)
    # Send the request through the chosen proxy.
    resp = requests.get('http://ip.filefab.com/index.php',
                        proxies={'http': 'http://' + ip})
    soup = BeautifulSoup(resp.text, 'html5lib')
    # Print the text of the <h1 id="ipd"> element of the response page.
    print(soup.find('h1', id='ipd').text.strip())

BeautifulSoup

# Build the soup from a downloaded page.
# Parser: lxml (described in these notes as the recommended, fastest one).
soup = BeautifulSoup(resp.text, 'lxml')

# Or build it from a local HTML file. Use "with" so the file handle is
# closed, and name the parser explicitly.
with open("index.html") as f:
    soup = BeautifulSoup(f, 'lxml')

印出完整HTML

#輸出排版後的 HTML 程式碼
print(soup.prettify())

用soup.find() [只能找單一元素,可回傳成Text或Str屬性]
用soup.find_all() [找所有元素,回傳符合內容成1個List]

resp = requests.get('https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html')
soup = BeautifulSoup(resp.text, 'html.parser')

# Text of the first <h1> tag
soup.find('h1').text
# Text of the first item in the list of all <h1> tags
soup.find_all('h1')[0].get_text()
# First <h4> whose class is "pk"
soup.find('h4', {'class': 'pk'})
# Both .text and .get_text() return the text of the tag found by find().
# soup.find('h4') is equivalent to soup.h4

# Search by id
soup.find(id='link2')

# Text of the first paragraph <p>
soup.find('p').text

# Search for several tag names at once
tags = soup.find_all(["title", "p"])
# Limit the number of results
tags = soup.find_all(["title", "p"], limit=2)

# <h3> tags whose class is "boldtext"
soup.find_all("h3", class_="boldtext")
# Any tag whose class is "outer-text"
soup.find_all(class_="outer-text")
# The keyword is spelled class_ because class is a reserved word in Python.

# Print the URL of every hyperlink
a_tags = soup.find_all('a')
for tag in a_tags:
    print(tag.get('href'))

# The following three calls do the same thing
soup.find_all("h4", {"class": "card-title"})
soup.find_all("h4", class_ = "card-title")
soup.find_all("h4", "card-title")

# .find().find_all()
# After find() returns an element you can call find_all() on that element,
# e.g. every <tr> under the first <table>:
soup.find("table").find_all("tr")
# Note: it cannot be written as .find_all().find()

# Handy one-liner: collect the text of every matching tag at once
[tag.text for tag in soup.find_all("tag")]

select()方法 [id用#、class用.、沒用#或.則視為標籤] - CSS

# CSS selector lookups (soup is the BeautifulSoup object built earlier)
soup.select('h4')                # all <h4> tags
soup.select('#books')            # the element whose id is "books"
soup.select('p #books')          # elements with id "books" nested inside a <p>
soup.select('.PK')               # elements whose class is "PK"
soup.select('h4.bk')             # <h4> tags whose class is "bk"
soup.select('img')               # all <img> tags
soup.select("p.strikeout.body")  # <p> tags carrying both classes

印標題和內文 — 直接選擇標籤Tag

# Print the <title> tag
print(soup.title)
# Print the tag's name
print(soup.title.name)
# Print the tag's string content
print(soup.title.string)
# Print <head>
print(soup.head)
# Print the first <a> tag
print(soup.a)
# Print the first <p> tag
print(soup.p)
# Print the first <table> (both forms are equivalent)
print(soup.table)
print(soup.find('table'))

# Print the text of every cell in the table, one row per line
table_rows = soup.find('table').find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)

# Print the text of the first <p> tag
print(soup.p.string)
# Print all text in the page
print(soup.get_text())

# Difference between soup.a and soup.a.string
print(soup.a)         # prints the markup of the <a> tag
print(soup.a.string)  # prints the text inside the <a> tag

獲得屬性名稱的內容 .get()

for link in soup.find_all('a'):
    print(link.get('href'))
# result:
# http://example.com/XXX1
# http://example.com/XXX2
# http://example.com/XXX3

取得文字.get_text()

print(soup.get_text())
# result:
# The Dormouse's story
# The Dormouse's story
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# ...

使用正規表達式

常見的 rule
Email: [A-Za-z0-9\._+]+@[A-Za-z0-9\._]+\.(com|org|edu|net)
URL: http(s)?://[A-Za-z0-9\./_]+
所有中文字(不含標點符號): [\u4e00-\u9fa5]+
或者Google 關鍵字找規則
# --------------------------------------------------------------------
import requests
import re
from bs4 import BeautifulSoup


def main():
    """Show each lookup twice: once by hand, once with a regular expression."""
    resp = requests.get('http://jwlin.github.io/py-scraping-analysis-book/ch2/blog/blog.html')
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Text of every heading tag, listing the tag names explicitly
    titles = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    for title in titles:
        print(title.text.strip())

    # Same result, matching the tag name with a regex
    for title in soup.find_all(re.compile(r'h[1-6]')):
        print(title.text.strip())

    # Every image whose src ends with .png
    imgs = soup.find_all('img')
    for img in imgs:
        if 'src' in img.attrs:
            if img['src'].endswith('.png'):
                print(img['src'])

    # Same result with a regex (raw string so "\." is not treated as a
    # string escape)
    for img in soup.find_all('img', {'src': re.compile(r'\.png$')}):
        print(img['src'])

    # Every image whose src ends with .png and contains 'beginner'
    imgs = soup.find_all('img')
    for img in imgs:
        if 'src' in img.attrs:
            if 'beginner' in img['src'] and img['src'].endswith('.png'):
                print(img['src'])

    # Same result with a regex
    for img in soup.find_all('img', {'src': re.compile(r'beginner.*\.png$')}):
        print(img['src'])


if __name__ == '__main__':
    main()
# .................................................................
# Collect the URL of every .jpg image
soup = BeautifulSoup(html, features='lxml')
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])

# Collect every link that starts with the given domain
course_links = soup.find_all('a', {'href': re.compile(r'https://morvan.*')})
for link in course_links:
    print(link['href'])

# urlretrieve can then download an image found this way
from urllib.request import urlretrieve
urlretrieve(IMAGE_URL, './img/image1.png')

正規表達式範例

# Extract a price from a shopping site
### Sample HTML
# <ul class="prdPrice">
#   <li>xxx<del>12,999</del></li>
#   <li class="special">
#     折扣後價格<span>12,799</span>
#   .....
### Locate the element, then pull the number out with a regular expression
txt = driver.find_element_by_xpath("//li[@class='special']").text
# Digits followed by any number of ",ddd" groups, so 1,299,000 is matched
# in full.
pattern = r'\d+(?:,\d{3})*'
price = re.search(pattern, txt).group()
print('商品價格:', price)

將爬蟲結果保存成CSV

import pandas as pd

# Build the table from the scraped rows, index it by column 'A', write it out.
column_names = ['A', 'B', 'C']
df_bs = pd.DataFrame(row_list, columns=column_names)
df_bs.set_index('A', inplace=True)
df_bs.to_csv('beautifulsoup.csv')

CSV檔案在Excel開啟時是亂碼

import csv

# Writing the BOM in a first open() and then reopening the file with mode
# 'w' truncates it, so the BOM is lost. The 'utf-8-sig' codec writes the
# UTF-8 BOM at the start of the file automatically, in a single step.
with open('fileName.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    # Replace this row with your own file operations.
    writer.writerow(['title', 'content'])

Dcard API操作範例

在Chrome瀏覽器環境下進入 Dcard首頁,點擊F12->Network->XHR

此時可以發現有多個不同的URL,我們找到posts開頭的URL點擊,並點選Preview,可以看到有30篇的Dcard熱門文章以Json格式保存

例如:

{read: true, newComment: false, id: 232931749, title: “#爆卦 #高調 武漢肺炎進到台灣了!!!”,…}

read: true
newComment: false
id: 232931749
title: “#爆卦 #高調 武漢肺炎進到台灣了!!!”
excerpt: “截至1月20號,共有八例疑似病例,其中六人已排除,另外兩名通報個案還在檢驗中,疾管署剛剛公布檢驗結果,一位呈現陽性反應,確認是冠狀病毒,台灣出現首例武漢肺炎,目前該病人已被隔離,疾管署也說口罩很充足,”
.
.
.
createdAt: “2020-01-21T10:27:05.857Z”
updatedAt: “2020-01-21T10:29:20.438Z”
commentCount: 3059
likeCount: 12903
topics: [“武漢”, “肺炎”, “台灣”, “淪陷”]
.
forumName: “時事”
forumAlias: “trending”
.
.
.
gender: “M”
school: “國立交通大學”.
.
.
以下省略

可以看到有相當多的資訊可以抓取,我們也可以複製該網址的URL:

https://www.dcard.tw/_api/posts?popular=true&limit=30

再搭配pandas和matplotlib來進行各種資料分析

除了在Dcard上可以使用這個方法來爬蟲之外,其他網站的資訊也可以透過檢查XHR的方法來抓取API。

實作

import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
url = 'https://www.youtube.com/channel/UCFdTiwvDjyc62DBWrlYDtlQ/videos?sort=p&view=0&flow=grid'
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')
target = soup.find_all('a')

# Save to a txt file; "with" closes the file even if a write fails.
with open('video-title.txt', 'w', encoding = 'UTF-8') as txt:
    for i in target:
        f = i.get_text().strip()  # take the text and trim surrounding whitespace
        txt.write(f)              # write the text
        txt.write('\n')           # line break

下載檔案

# Create the target folder
import os
os.makedirs('./img/', exist_ok=True)
IMAGE_URL = "https://morvanzhou.github.io/static/img/description/learning_step_flowchart.png"

# Download with urlretrieve
from urllib.request import urlretrieve
urlretrieve(IMAGE_URL, './img/image1.png')

# Download with requests (for small files: the whole body is held in memory)
import requests
r = requests.get(IMAGE_URL)
with open('./img/image2.png', 'wb') as f:
    f.write(r.content)

# Chunked download with requests (for large files)
r = requests.get(IMAGE_URL, stream=True)  # stream loading
with open('./img/image3.png', 'wb') as f:
    # 8 KiB chunks; 32-byte chunks mean thousands of tiny writes.
    for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)

抓歌詞

import requests
from bs4 import BeautifulSoup

# Named resp rather than re so the regex module name is not shadowed.
resp = requests.get('https://mojim.com/twy108268x5x2.htm')

soup = BeautifulSoup(resp.text, 'html.parser')
lyrics_tag = soup.find(id='fsZx3')
# find() returns None when the element is missing; print an empty string
# instead of raising AttributeError.
song = lyrics_tag.text if lyrics_tag is not None else ''

print(song)

下載 Yahoo 頭條新聞

import requests
from bs4 import BeautifulSoup

# Download the Yahoo home page
r = requests.get('https://tw.yahoo.com/')

# Continue only if the download succeeded
if r.status_code == requests.codes.ok:
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(r.text, 'html.parser')

    # Select the headline links by their CSS class
    stories = soup.find_all('a', class_='story-title')
    for s in stories:
        # Headline
        print("標題:" + s.text)
        # Link; default to '' so a missing href cannot break the concatenation
        print("網址:" + s.get('href', ''))

下載Google搜尋結果

import requests
from bs4 import BeautifulSoup

# Google search URL
google_url = 'https://www.google.com.tw/search'

# Query parameters
my_params = {'q': 'PM2.5'}

# Download the Google search results
r = requests.get(google_url, params = my_params)

# Continue only if the download succeeded
if r.status_code == requests.codes.ok:
    # Parse the HTML source with BeautifulSoup
    soup = BeautifulSoup(r.text, 'html.parser')

    # Uncomment to inspect the HTML source
    # print(soup.prettify())

    # Pick the result links with a CSS selector
    items = soup.select('div.g > h3.r > a[href^="/url"]')
    for i in items:
        # Title
        print("標題:" + i.text)
        # Link
        print("網址:" + i.get('href'))

Google股票資訊

import requests
from bs4 import BeautifulSoup

# Append MARKET:STOCK_ID to the URL to get one stock's page, e.g. TPE:2330
G_FINANCE_URL = 'https://www.google.com/search?q='


def get_web_page(url, stock_id):
    """Return the page text for url + stock_id, or None if the status is not 200."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/66.0.3359.181 Safari/537.36'}
    resp = requests.get(url + stock_id, headers=headers)
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text


def get_stock_info(dom):
    """Parse the page text and return a dict of stock fields.

    The dict holds 'name', 'current_price' and 'current_change', plus one
    entry per table row read from the fourth g-card-section.
    """
    soup = BeautifulSoup(dom, 'html5lib')
    stock = dict()
    sections = soup.find_all('g-card-section')

    # 2nd g-card-section: company name and live price information
    stock['name'] = sections[1].div.text
    spans = sections[1].find_all('div', recursive=False)[1].find_all('span', recursive=False)
    stock['current_price'] = spans[0].text
    stock['current_change'] = spans[1].text

    # 4th g-card-section: a left and a right table holding the stock details
    for table in sections[3].find_all('table'):
        for tr in table.find_all('tr')[:3]:
            cells = tr.find_all('td')
            key = cells[0].text.lower().strip()
            value = cells[1].text.strip()
            stock[key] = value
    return stock


if __name__ == '__main__':
    page = get_web_page(G_FINANCE_URL, 'TPE:2330')
    if page:
        stock = get_stock_info(page)
        for k, v in stock.items():
            print(k, v)

蘋果日報及自由時報今日焦點

import requests
from bs4 import BeautifulSoup


def main():
    """Print today's headlines from Apple Daily, then from Liberty Times."""
    print('蘋果今日焦點')
    dom = requests.get('http://www.appledaily.com.tw/appledaily/hotdaily/headline').text
    soup = BeautifulSoup(dom, 'html5lib')
    for ele in soup.find('ul', 'all').find_all('li'):
        print(
            ele.find('div', 'aht_title_num').text,
            ele.find('div', 'aht_title').text,
            ele.find('div', 'aht_pv_num').text
        )
    print('-----------')
    print('自由今日焦點')
    dom = requests.get('http://news.ltn.com.tw/list/newspaper').text
    soup = BeautifulSoup(dom, 'html5lib')
    for ele in soup.find('ul', 'list').find_all('li'):
        print(ele.find('a', 'tit').text.strip())


if __name__ == '__main__':
    main()

PTT表特版爬蟲

#講解如何繞過cookies的18歲限制

Chrome->F12->Network->Ctrl+R->index.html->Headers->cookie,
找到over18=1,將此內容放到如下地方
resp = requests.get(url=url, cookies={'over18': '1'})

import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
PTT_URL = 'https://www.ptt.cc'


def get_web_page(url):
    """Return the page text for url, or None if the status is not 200."""
    resp = requests.get(
        url=url,
        cookies={'over18': '1'}  # bypass the over-18 confirmation
    )
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
def get_articles(dom, date):
    """Return (articles, prev_url) for one board index page.

    articles is a list of dicts with the keys 'title', 'href', 'push_count'
    and 'author', one per entry whose date text equals date and that still
    has a link. prev_url is the href of the second link in the paging
    button group.
    """
    soup = BeautifulSoup(dom, 'html5lib')

    # Link to the previous page
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []  # collected article data
    for d in soup.find_all('div', 'r-ent'):
        if d.find('div', 'date').text.strip() != date:
            continue  # not posted on the requested date

        # Push count
        push_count = 0
        push_str = d.find('div', 'nrec').text
        if push_str:
            try:
                push_count = int(push_str)  # convert the string to a number
            except ValueError:
                # Conversion failed: the text may be '爆' or 'X1', 'X2', ...
                # Anything else leaves push_count at 0.
                if push_str == '爆':
                    push_count = 99
                elif push_str.startswith('X'):
                    push_count = -10

        # Link and title: a hyperlink means the article exists (not deleted)
        link = d.find('a')
        if link:
            author_div = d.find('div', 'author')
            articles.append({
                'title': link.text,
                'href': link['href'],
                'push_count': push_count,
                'author': author_div.text if author_div else ''
            })
    return articles, prev_url
def parse(dom):
    """Return the imgur link URLs found inside the element with id 'main-content'.

    Returns an empty list when the page has no such element.
    """
    soup = BeautifulSoup(dom, 'html.parser')
    main_content = soup.find(id='main-content')
    if main_content is None:
        return []
    # Dots are escaped so that only a literal "." matches.
    imgur_pattern = re.compile(r'^https?://(i\.)?(m\.)?imgur\.com')
    # href=True skips anchors that have no href attribute.
    return [
        link['href']
        for link in main_content.find_all('a', href=True)
        if imgur_pattern.match(link['href'])
    ]
def save(img_urls, title):
    """Download every URL in img_urls into a folder named after title.

    Does nothing when img_urls is empty. Errors are printed rather than
    raised, and a failed download does not stop the remaining ones.
    """
    if not img_urls:
        return
    # Strip surrounding whitespace and replace characters that are not
    # allowed in a directory name.
    dname = re.sub(r'[\\/:*?"<>|]', '_', title.strip())
    try:
        os.makedirs(dname, exist_ok=True)  # re-running must not fail
    except OSError as e:
        print(e)
        return
    for img_url in img_urls:
        try:
            # e.g. 'http://imgur.com/A2wmlqW.jpg'.split('//') -> ['http:', 'imgur.com/A2wmlqW.jpg']
            if img_url.split('//')[1].startswith('m.'):
                img_url = img_url.replace('//m.', '//i.')
            if not img_url.split('//')[1].startswith('i.'):
                img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
            if not img_url.endswith('.jpg'):
                img_url += '.jpg'
            fname = img_url.split('/')[-1]
            urllib.request.urlretrieve(img_url, os.path.join(dname, fname))
        except Exception as e:
            # Best effort: report the failure and go on to the next image.
            print(e)
if __name__ == '__main__':
    current_page = get_web_page(PTT_URL + '/bbs/Beauty/index.html')
    if current_page:
        articles = []  # all of today's articles
        # Today's date without the leading '0', to match the PTT format
        date = time.strftime("%m/%d").lstrip('0')
        # Today's articles on the current page
        current_articles, prev_url = get_articles(current_page, date)
        # While the page has articles from today, keep them and move to the
        # previous page to look for more.
        while current_articles:
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            if not current_page:
                break  # the previous page could not be fetched
            current_articles, prev_url = get_articles(current_page, date)

        # Article list is complete: open each article and fetch its images
        for article in articles:
            print('Processing', article)
            page = get_web_page(PTT_URL + article['href'])
            if page:
                img_urls = parse(page)
                save(img_urls, article['title'])
                article['num_image'] = len(img_urls)

        # Store the article information
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=10, sort_keys=True, ensure_ascii=False)

PTT食物版標題抓取

import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.ptt.cc/bbs/Food/')
soup = BeautifulSoup(res.text, 'lxml')
# Each .r-ent element is one entry of the board index; print its title text.
for item in soup.select(".r-ent"):
    print(item.select(".title")[0].text.strip())

Youtube熱門影片標題爬蟲,並存至txt當中

import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/feed/trending/"
request = requests.get(url)
content = request.content
soup = BeautifulSoup(content, "html.parser")
container = soup.select("h3 a")

# Write to result.txt. The encoding is given explicitly so the result does
# not depend on the platform default.
with open('result.txt', 'w', encoding='UTF-8') as file:
    for item in container:
        value = item.get_text()
        print(value)
        file.write(value + '\n')

# Save as a txt file, with surrounding whitespace trimmed from each title
with open('video-title.txt', 'w', encoding = 'UTF-8') as txt:
    for i in container:
        f = i.get_text().strip()  # take the text and trim surrounding whitespace
        txt.write(f)              # write the text
        txt.write('\n')           # line break

電子書網頁標題翻頁爬蟲

import requests
from bs4 import BeautifulSoup

for page in range(1, 50):  # pages 1 to 49
    url = "http://www.allitebooks.org/page/" + str(page)
    print(url)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    # Print the string of every <h2> on the page
    titles = soup.find_all('h2')
    for t in titles:
        print(t.string)

Written by

Machine Learning / Deep Learning / Python / Flutter cakeresume.com/yanwei-liu

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store