Python爬蟲學習筆記(一) - Requests, BeautifulSoup, 正規表達式,API
Python 網路爬蟲與資料分析入門實戰 GitHub Repo
解決複雜圖片驗證碼captcha和recaptcha
先備知識
GET:讀取一般網頁內容
POST:填表單傳送資料時使用
安裝Requests和BeautifulSoup
pip install beautifulsoup4
pip install requests
引入模組
import requests
from bs4 import BeautifulSoup
Requests
使用Requests
# Fetch a page and handle its text encoding.
url = "https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html"
resp = requests.get(url)

# If the decoded text looks garbled, set the response encoding explicitly:
resp.encoding = 'utf-8'   # e.g. force UTF-8
# resp.encoding = 'big5'  # or the page's real charset, e.g. Big5, or gbk for simplified Chinese

# Check the HTTP status code:
resp.status_code
# 200 (and 2xx in general) means success;
# 4xx / 5xx indicate an error.
#開頭為4或5表示錯誤
若想解析亂碼該怎麼辦
至以下網站貼上亂碼,選擇對應的編碼,即可轉換
https://www.webatic.com/url-convertor
params:加入參數
import requests

# GET with custom headers and query-string parameters;
# this requests google.com/search?q=unicorns
r = requests.get('https://www.google.com/search',
                 headers={'Accept': 'application/json'},  # fixed: comma was missing here
                 params={'q': 'unicorns'}
                 )
requests.content / requests.text / requests.json()
import requests

# Three ways to read a response body:
r = requests.get('https://official-joke-api.appspot.com/random_joke')
r.content  # return raw bytes data
r.text     # return string
r.json()   # return json (dict)
payload:使傳回資料以指定格式顯示(GET方法)
import requests
payload = {'key1': 'value1', 'key2': 'value2'}
# 將查詢參數加入 GET 請求中
html = requests.get("http://httpbin.org/get", params=payload)
print(html.url) # http://httpbin.org/get?key1=value1&key2=value2
payload:使傳回資料以指定格式顯示(POST方法)
import requests
payload = {'key1': 'value1', 'key2': 'value2'}
# 將查詢參數加入 POST 請求中
html = requests.post("http://httpbin.org/post", data=payload)
print(html.text) #以json格式呈現
操作Cookie
import requests
from bs4 import BeautifulSoup

# Form data for PTT's over-18 confirmation page.
payload = {
    'from': 'https://www.ptt.cc/bbs/Gossiping/index.html',
    'yes': 'yes'
}
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}

# Use a Session so the cookie set by the POST is sent with the later GET.
rs = requests.Session()
rs.post('https://www.ptt.cc/ask/over18', data=payload, headers=headers)
res = rs.get('https://www.ptt.cc/bbs/Gossiping/index.html', headers=headers)

soup = BeautifulSoup(res.text, 'html.parser')
items = soup.select('.r-ent')
for item in items:
    print(item.select('.date')[0].text, item.select('.author')[0].text, item.select('.title')[0].text)
翻頁
#使用for迴圈翻頁,
# Build the URL for every page from 1 to 99.
for i in range(1, 100):
    url = 'http://example.com/page{}'.format(i)
判斷網頁是否更新
import requests
import os
import json
import hashlib


def save_newaqi():
    """Save the downloaded AQI data to newaqi.json."""
    with open(fn, 'w') as f:
        json.dump(aqijsons.json(), f)


def save_hashvalue():
    """Save the current hash value to hashvalue.txt."""
    with open(fn_hash, 'w') as fileobj:
        fileobj.write(newhash)


def cal_hashvalue():
    """Return the MD5 hex digest of the downloaded response text."""
    data = hashlib.md5()
    data.update(aqijsons.text.encode('utf-8'))
    return data.hexdigest()


url = ('http://opendata.epa.gov.tw/webapi/Data/REWIQA/?$orderby=SiteName&$'
       'skip=0&$top=1000&format=json')
try:
    aqijsons = requests.get(url)  # download the AQI feed
    print('下載成功')
except Exception as err:
    print('下載失敗')
    raise  # fixed: without re-raising, the code below hits an undefined 'aqijsons'

fn = 'newaqi.json'
fn_hash = 'hashvalue.txt'

if os.path.exists(fn_hash):           # a previous hash exists: compare
    newhash = cal_hashvalue()
    print('newhash = ', newhash)
    with open(fn_hash, 'r') as fnObj:  # read the old hash value
        oldhash = fnObj.read()
    print('oldhash = ', oldhash)
    if newhash == oldhash:
        print('環保署空氣品質資料未更新')
    else:
        print('環保署空氣品質資料已經更新')
        save_newaqi()
        save_hashvalue()
else:                                 # first run: no hash file yet
    print('第一次啟動此程式')
    newhash = cal_hashvalue()  # fixed: was assigned to 'newhvalue' and printed as 'newvalue'
    print('哈希值 = ', newhash)
    save_hashvalue()
    save_newaqi()
使用代理伺服器
# Python 3: urllib2 was merged into urllib.request.
import urllib.request

proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib.request.build_opener(proxy)
urllib.request.install_opener(opener)  # all later urlopen() calls go through the proxy
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read())
處理防爬蟲機制 (User-agent)
# Tip: mobile pages are usually lighter — consider a mobile user-agent
# if you want the slimmer version of a site.

# Use a fake (browser) header.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
resp = requests.get(url, headers=headers)
resp.encoding = 'utf8'
print(resp.text)

# Pick a header at random from a pool of real browser user-agents.
import random

headerlist = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
              "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 OPR/43.0.2442.991",
              "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 OPR/42.0.2393.94",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36 OPR/47.0.2631.39",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
              "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
              "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
              "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
              "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
              "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
              "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"]
user_agent = random.choice(headerlist)  # pick one at random per request
headers = {'User-Agent': user_agent}

# Googlebot user-agents, if you want to crawl as Google does:
# https://support.google.com/webmasters/answer/1061943?hl=zh-Hant
# https://developers.whatismybrowser.com/useragents/explore/software_name/googlebot/
# https://searchengineland.com/google-to-update-googlebots-user-agent-322859
防止一秒太多次Requests
import requests
import time
import random

url = "XXXXXXXXXXXXXXXXX"
contextLi = []

# Fetch the page 10 times, pausing between requests so we don't
# hammer the server (and get rate-limited or blocked).
i = 0
while i < 10:
    resp = requests.get(url)  # renamed from 're' so the stdlib re module isn't shadowed
    resp.encoding = 'utf8'
    contextLi.append(resp.text)
    i += 1
    print(i, " succeed")
    time.sleep(2)                       # fixed 2-second pause
    time.sleep(random.randint(1, 10))   # plus a random 1-10 second pause
需要帳號登入的網站
import requests

url = 'XXXXXXXXXXXXXXXXXXXXXXXXXXX'

# Custom headers.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

# Attach the custom headers to the GET request.
r = requests.get(url, headers=headers)  # fixed: was the undefined name 'my_headers'

# A page that needs an account login (HTTP basic auth).
r = requests.get(url, auth=('帳號', '密碼'))

# Give up if there is no response within 3 seconds.
requests.get('http://github.com/', timeout=3)
使用Session方法登入網站
# Log in with a Session so the login cookie is reused by later requests.
session = requests.Session()
payload = {'username': 'Morvan', 'password': 'password'}
r = session.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict())  # cookies returned by the login
r = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(r.text)
使用API
import requests
url = 'XXXXXXXXXXXXXXXXXXXXXXXXXXX'
token ='yourAPI_KEY'
r = requests.get(url, headers={"token":token})
被封鎖IP了嗎
from bs4 import BeautifulSoup
import requests
import random

if __name__ == '__main__':
    # Proxy list source: http://cn-proxy.com/
    proxy_ips = ['51.15.227.220:3128', '81.162.56.154:8081']
    ip = random.choice(proxy_ips)  # rotate proxies at random
    print('Use', ip)
    resp = requests.get('http://ip.filefab.com/index.php',
                        proxies={'http': 'http://' + ip})
    soup = BeautifulSoup(resp.text, 'html5lib')
    # The page echoes the requesting IP inside <h1 id="ipd">.
    print(soup.find('h1', id='ipd').text.strip())
BeautifulSoup
建立Soup
# Build the parse tree for scraping.
soup = BeautifulSoup(resp.text, 'lxml')
# Parser: 'lxml' is the officially recommended (fastest) choice.

# Or parse a local HTML file directly.
soup = BeautifulSoup(open('index.html'))

# Reading the HTML from a file explicitly:
with open("index.html") as f:
    soup = BeautifulSoup(f)
印出完整HTML
#輸出排版後的 HTML 程式碼
print(soup.prettify())
用soup.find() [只能找單一元素,可回傳成Text或Str屬性]
用soup.find_all() [找所有元素,回傳符合內容成1個List]
resp = requests.get('https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html')
soup = BeautifulSoup(resp.text, 'html.parser')

soup.find('h1').text                  # text of the first <h1>
soup.find_all('h1')[0].get_text()     # same, via find_all (fixed: stray ')' removed)
soup.find('h4', {'class': 'pk'})      # first <h4> with class "pk"
soup.find(id='link2')                 # the element whose id is "link2"
#find用.text顯示文字
#find用.get_text()顯示文字(資料型態須為字串格式)
#soup.find('h4')等同於soup.h4# 根據 id 搜尋
soup.find(id='link2')#印出段落p內文
soup.find('p').text#搜尋多個標籤
tags = soup.find_all(["title", "p"])#限制搜尋結果數量
tags = soup.find_all(["title", "p"], limit=2)# 搜尋h3標籤且class為 boldtext 的內容
soup.find_all("h3", class_="boldtext")
# 搜尋 class 為 outer-text的內容
soup.find_all(class_="outer-text")#class用class_表示的原因是因為class是Python保留字#輸出所有超連結URL
a_tags = soup.find_all('a')for tag in a_tags:
print(tag.get('href'))#以下寫法都相同功能
soup.find_all("h4", {"class": "card-title"})
soup.find_all("h4", class_ = "card-title")
soup.find_all("h4", "card-title")#.find().find_all()
find完element後,再針對element進行find_all()的動作
例如:soup.find("table").find_all("tr")
找到table標籤底下的所有tr標籤
注意不可寫成.find_all().find()#實用的for loop寫法,一口氣抓出所有內容文字
[tag.text for tag in soup.find_all("tag")]
select()方法 [id用#、class用.、沒用#或.則視為標籤] - CSS
#查所有的CSS
bs.select('h4')        # all <h4> tags
bs.select('#books')    # the element with id="books"
bs.select('p #books')  # elements with id="books" inside a <p>  (fixed: selector was missing 'p')
bs.select('.PK')       # elements with class "PK"
bs.select('h4.bk')     # <h4> tags with class "bk"  (fixed: comment said class "PK")
bs.select('img')       # all <img> tags
bs.select("p.strikeout.body")  # <p> with both classes "strikeout" and "body"
印標題和內文 — 直接選擇標籤Tag
# Print the <title> tag.
print(soup.title)
# Print the tag's name (i.e. 'title').
print(soup.title.name)
# Print the tag's text value.
print(soup.title.string)
# Print the <head> tag.
print(soup.head)
# Print the first <a> tag.
print(soup.a)
# Print the first <p> tag.
print(soup.p)
# Print the table.
print(soup.table)
print(soup.find('table'))
# Print every cell's text in the table, row by row.
table_rows = soup.find('table').find_all('tr')  # fixed: table_rows was never defined
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
# Print the first <p> tag's text.
print(soup.p.string)
# Print all the text in the page.
soup.get_text()
# soup.a vs soup.a.string:
print(soup.a)         # prints the <a> tag's markup
print(soup.a.string)  # prints only the <a> tag's text
獲得屬性名稱的內容 .get()
for link in soup.find_all('a'):
print(link.get('href'))#result:
http://example.com/XXX1
http://example.com/XXX2
http://example.com/XXX3
取得文字.get_text()
print(soup.get_text())
#result:
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
使用正規表達式
常見的 rule
Email: [A-Za-z0-9\._+]+@[A-Za-z0-9\._]+\.(com|org|edu|net)
URL: http(s)?://[A-Za-z0-9\./_]+
所有中文字(不含標點符號): [\u4e00-\u9fa5]+
或者Google 關鍵字找規則--------------------------------------------------------------------import requests
import re
from bs4 import BeautifulSoupdef main():
resp = requests.get('http://jwlin.github.io/py-scraping-analysis-book/ch2/blog/blog.html')
soup = BeautifulSoup(resp.text, 'html.parser')# 找出所有 'h' 開頭的標題文字
titles = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
for title in titles:
print(title.text.strip())# 利用 regex 找出所有 'h' 開頭的標題文字
for title in soup.find_all(re.compile('h[1-6]')):
print(title.text.strip())# 找出所有 .png 結尾的圖片
imgs = soup.find_all('img')
for img in imgs:
if 'src' in img.attrs:
if img['src'].endswith('.png'):
print(img['src'])# 利用 regex 找出所有 .png 結尾的圖片
for img in soup.find_all('img', {'src': re.compile('\.png$')}):
print(img['src'])# 找出所有 .png 結尾且含 'beginner' 的圖片
imgs = soup.find_all('img')
for img in imgs:
if 'src' in img.attrs:
if 'beginner' in img['src'] and img['src'].endswith('.png'):
print(img['src'])# 利用 regex 找出所有 .png 結尾且含 'beginner' 的圖片
for img in soup.find_all('img', {'src': re.compile('beginner.*\.png$')}):
print(img['src'])if __name__ == '__main__':
main().................................................................#爬取所有jpg圖檔的網址
soup = BeautifulSoup(html, features='lxml')img_links = soup.find_all("img", {"src": re.compile('.*?\.jpg')})
for link in img_links:
print(link['src'])#爬取該網域開頭的所有連結
course_links = soup.find_all('a', {'href': re.compile('https://morvan.*')})
for link in course_links:
print(link['href'])#可搭配urlretrieve下載爬到的圖片檔案from urllib.request import urlretrieve
urlretrieve(IMAGE_URL, './img/image1.png')
正規表達式範例
# Extract the price from a shopping site.
### sample html
# <ul class="prdPrice">
#   <li>xxx<del>12,999</del></li>
#   <li class="special">
#     折扣後價格<span>12,799</span>
# .....

# After locating the element, pull the digits out with a regex.
txt = driver.find_element_by_xpath("//li[@class='special']").text  # fixed: closing quote was missing
pattern = r'(\d+\,)?(\d+)'  # optional thousands group + digits, e.g. '12,799'
price = re.search(pattern, txt).group()
print('商品價格:', price)
將爬蟲結果保存成CSV
import pandas as pd
df_bs = pd.DataFrame(row_list,columns=['A','B','C'])
df_bs.set_index('A',inplace=True)
df_bs.to_csv('beautifulsoup.csv')
CSV檔案在Excel開啟時是亂碼
# Step 1: write the UTF-8 BOM at the start so Excel detects the encoding.
with open('fileName.csv', 'wb') as f:
    f.write(b'\xEF\xBB\xBF')
# Step 2: append the CSV content. The mode must be 'a' here — reopening
# with 'w' would truncate the file and throw away the BOM just written.
with open('fileName.csv', 'a', encoding='utf-8', newline='') as f:
    pass  # write the CSV rows here
Dcard API操作範例
在Chrome瀏覽器環境下進入 Dcard首頁,點擊F12->Network->XHR
此時可以發現有多個不同的URL,我們找到posts開頭的URL點擊,並點選Preview,可以看到有30篇的Dcard熱門文章以Json格式保存
例如:
{read: true, newComment: false, id: 232931749, title: “#爆卦 #高調 武漢肺炎進到台灣了!!!”,…}
read: truenewComment: falseid: 232931749title: “#爆卦 #高調 武漢肺炎進到台灣了!!!”excerpt: “截至1月20號,共有八例疑似病例,其中六人已排除,另外兩名通報個案還在檢驗中,疾管署剛剛公布檢驗結果,一位呈現陽性反應,確認是冠狀病毒,台灣出現首例武漢肺炎,目前該病人已被隔離,疾管署也說口罩很充足,”.
.
.createdAt: “2020–01–21T10:27:05.857Z”updatedAt: “2020–01–21T10:29:20.438Z”commentCount: 3059likeCount: 12903topics: [“武漢”, “肺炎”, “台灣”, “淪陷”].forumName: “時事”forumAlias: “trending”.
.
.
gender: “M”school: “國立交通大學”.
.
.
以下省略
可以看到有相當多的資訊可以抓取,我們也可以複製該網址的URL:
https://www.dcard.tw/_api/posts?popular=true&limit=30
再搭配pandas和matplotlib來進行各種資料分析
除了在Dcard上可以使用這個方法來爬蟲之外,其他網站的資訊也可以透過檢查XHR的方法來抓取API。
實作
Youtube影片標題爬蟲
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
url = 'https://www.youtube.com/channel/UCFdTiwvDjyc62DBWrlYDtlQ/videos?sort=p&view=0&flow=grid'
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')
target = soup.find_all('a')

# Write every link's text to video-title.txt, one per line.
# 'with' guarantees the file is closed even if an error occurs mid-loop.
with open('video-title.txt', 'w', encoding='UTF-8') as txt:
    for i in target:
        f = i.get_text().strip()  # link text, surrounding whitespace removed
        txt.write(f)
        txt.write('\n')
下載檔案
# Create the target directory (no error if it already exists).
import os
os.makedirs('./img/', exist_ok=True)

IMAGE_URL = "https://morvanzhou.github.io/static/img/description/learning_step_flowchart.png"

# Download with urlretrieve.
from urllib.request import urlretrieve
urlretrieve(IMAGE_URL, './img/image1.png')

# Download with requests (fine for small files).
import requests
r = requests.get(IMAGE_URL)
with open('./img/image2.png', 'wb') as f:
    f.write(r.content)

# Download in chunks with requests (better for large files).
r = requests.get(IMAGE_URL, stream=True)  # stream loading
with open('./img/image3.png', 'wb') as f:
    for chunk in r.iter_content(chunk_size=32):
        f.write(chunk)
抓歌詞
import requests
from bs4 import BeautifulSoup

# 'resp' instead of 're' so the stdlib re module isn't shadowed.
resp = requests.get('https://mojim.com/twy108268x5x2.htm')
soup = BeautifulSoup(resp.text, 'html.parser')
song = soup.find(id='fsZx3').text  # the element that holds the lyrics
print(song)
下載 Yahoo 頭條新聞
import requests
from bs4 import BeautifulSoup

# Grab the Yahoo Taiwan front page.
r = requests.get('https://tw.yahoo.com/')
# Proceed only when the download succeeded (HTTP 200).
if r.status_code == requests.codes.ok:
    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(r.text, 'html.parser')
    # Headline links carry the 'story-title' CSS class.
    for s in soup.find_all('a', class_='story-title'):
        print("標題:" + s.text)        # headline text
        print("網址:" + s.get('href'))  # headline URL
下載Google搜尋結果
import requests
from bs4 import BeautifulSoup
# Google 搜尋 URL
google_url = 'https://www.google.com.tw/search'
# 查詢參數
my_params = {'q': 'PM2.5'}
# 下載 Google 搜尋結果
r = requests.get(google_url, params = my_params)
# 確認是否下載成功
if r.status_code == requests.codes.ok:
# 以 BeautifulSoup 解析 HTML 原始碼
soup = BeautifulSoup(r.text, 'html.parser')
# 觀察 HTML 原始碼
# print(soup.prettify())
# 以 CSS 的選擇器來抓取 Google 的搜尋結果
items = soup.select('div.g > h3.r > a[href^="/url"]')
for i in items:
# 標題
print("標題:" + i.text)
# 網址
print("網址:" + i.get('href'))
Google股票資訊
import requests
from bs4 import BeautifulSoup

# Appending MARKET:STOCK_ID to the URL gives one stock's page, e.g. TPE:2330.
G_FINANCE_URL = 'https://www.google.com/search?q='


def get_web_page(url, stock_id):
    """Download the quote page for stock_id; return the HTML text, or None on failure."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/66.0.3359.181 Safari/537.36'}
    resp = requests.get(url + stock_id, headers=headers)
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    else:
        return resp.text


def get_stock_info(dom):
    """Scrape name, live price/change and the detail tables from the quote page DOM."""
    soup = BeautifulSoup(dom, 'html5lib')
    stock = dict()
    sections = soup.find_all('g-card-section')
    # 2nd g-card-section: company name plus live price information.
    stock['name'] = sections[1].div.text
    spans = sections[1].find_all('div', recursive=False)[1].find_all('span', recursive=False)
    stock['current_price'] = spans[0].text
    stock['current_change'] = spans[1].text
    # 4th g-card-section: left and right tables with per-stock details.
    for table in sections[3].find_all('table'):
        for tr in table.find_all('tr')[:3]:
            key = tr.find_all('td')[0].text.lower().strip()
            value = tr.find_all('td')[1].text.strip()
            stock[key] = value
    return stock


if __name__ == '__main__':
    page = get_web_page(G_FINANCE_URL, 'TPE:2330')
    if page:
        stock = get_stock_info(page)
        for k, v in stock.items():
            print(k, v)
蘋果日報及自由時報今日焦點
import requests
from bs4 import BeautifulSoup


def main():
    """Print today's top stories from Apple Daily and Liberty Times."""
    print('蘋果今日焦點')
    dom = requests.get('http://www.appledaily.com.tw/appledaily/hotdaily/headline').text
    soup = BeautifulSoup(dom, 'html5lib')
    for ele in soup.find('ul', 'all').find_all('li'):
        print(
            ele.find('div', 'aht_title_num').text,
            ele.find('div', 'aht_title').text,
            ele.find('div', 'aht_pv_num').text
        )
    print('-----------')
    print('自由今日焦點')
    dom = requests.get('http://news.ltn.com.tw/list/newspaper').text
    soup = BeautifulSoup(dom, 'html5lib')
    for ele in soup.find('ul', 'list').find_all('li'):
        print(ele.find('a', 'tit').text.strip())


if __name__ == '__main__':
    main()
PTT表特版爬蟲
#講解如何繞過cookies的18歲限制
Chrome->F12->Network->Ctrl+R->index.html->Headers->cookie,
找到over18=1,將此內容放到如下地方
resp = requests.get(url=url,cookies={‘over18’: ‘1’})
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json

PTT_URL = 'https://www.ptt.cc'


def get_web_page(url):
    """Download url with the over-18 cookie set; return the HTML text, or None."""
    resp = requests.get(
        url=url,
        cookies={'over18': '1'}  # bypass the age-confirmation page
    )
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    else:
        return resp.text


def get_articles(dom, date):
    """Return (articles posted on `date`, link to the previous index page)."""
    soup = BeautifulSoup(dom, 'html5lib')

    # Link to the previous index page.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []  # collected article records
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        if d.find('div', 'date').text.strip() == date:  # posted on the wanted date
            # Push (upvote) count.
            push_count = 0
            push_str = d.find('div', 'nrec').text
            if push_str:
                try:
                    push_count = int(push_str)
                except ValueError:
                    # Not a plain number: '爆' (very popular) or 'X1', 'X2', ...
                    # Anything else leaves push_count at 0.
                    if push_str == '爆':
                        push_count = 99
                    elif push_str.startswith('X'):
                        push_count = -10
            # A link means the article still exists (was not deleted).
            if d.find('a'):
                href = d.find('a')['href']
                title = d.find('a').text
                author = d.find('div', 'author').text if d.find('div', 'author') else ''
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count,
                    'author': author
                })
    return articles, prev_url


def parse(dom):
    """Collect every imgur link inside the article body."""
    soup = BeautifulSoup(dom, 'html.parser')
    links = soup.find(id='main-content').find_all('a')
    img_urls = []
    for link in links:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):
            img_urls.append(link['href'])
    return img_urls


def save(img_urls, title):
    """Download img_urls into a directory named after the article title."""
    if img_urls:
        try:
            dname = title.strip()  # strip() removes surrounding whitespace
            os.makedirs(dname)
            for img_url in img_urls:
                # e.g. 'http://imgur.com/A2wmlqW.jpg'.split('//') -> ['http:', 'imgur.com/A2wmlqW.jpg']
                if img_url.split('//')[1].startswith('m.'):
                    img_url = img_url.replace('//m.', '//i.')
                if not img_url.split('//')[1].startswith('i.'):
                    img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
                if not img_url.endswith('.jpg'):
                    img_url += '.jpg'
                fname = img_url.split('/')[-1]
                urllib.request.urlretrieve(img_url, os.path.join(dname, fname))
        except Exception as e:
            print(e)


if __name__ == '__main__':
    current_page = get_web_page(PTT_URL + '/bbs/Beauty/index.html')
    if current_page:
        articles = []  # all of today's articles
        date = time.strftime("%m/%d").lstrip('0')  # strip leading '0' to match PTT's date format
        current_articles, prev_url = get_articles(current_page, date)
        # Walk back one index page at a time while articles from today are found.
        while current_articles:
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, date)
        # Article list complete — fetch each article and download its images.
        for article in articles:
            print('Processing', article)
            page = get_web_page(PTT_URL + article['href'])
            if page:
                img_urls = parse(page)
                save(img_urls, article['title'])
                article['num_image'] = len(img_urls)
        # Save the article metadata.
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=10, sort_keys=True, ensure_ascii=False)
PTT食物版標題抓取
import requests
from bs4 import BeautifulSoup

# One .r-ent block per post on the index page; its .title child holds the headline.
res = requests.get('https://www.ptt.cc/bbs/Food/')
soup = BeautifulSoup(res.text, 'lxml')
titles = [entry.select(".title")[0].text.strip() for entry in soup.select(".r-ent")]
for title in titles:
    print(title)
Youtube熱門影片標題爬蟲,並存至txt當中
import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/feed/trending/"
request = requests.get(url)
content = request.content
soup = BeautifulSoup(content, "html.parser")
container = soup.select("h3 a")  # trending-video title links

# Write each trending-video title to result.txt.
with open('result.txt', 'w') as file:
    for item in container:
        if item:
            value = item.get_text()
            print(value)
            file.write(value + '\n')

# Variant: save to video-title.txt with explicit UTF-8 encoding.
# (fixed: this loop previously iterated an undefined name 'target')
with open('video-title.txt', 'w', encoding='UTF-8') as txt:
    for i in container:
        f = i.get_text().strip()  # title text, surrounding whitespace removed
        txt.write(f)
        txt.write('\n')
電子書網頁標題翻頁爬蟲
import requests
from bs4 import BeautifulSoup

# Pages 1 through 49 of the e-book index.
for page_num in range(1, 50):
    url = "http://www.allitebooks.org/page/" + str(page_num)  # no need to rebind the loop var to str
    print(url)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    titles = soup.find_all('h2')  # one <h2> per book title
    for t in titles:
        print(t.string)