R語言學習筆記(四):tidyverse

介紹

向量計算優先於迴圈

tidyverse包含以下三個套件的功能:
magrittr:使用 %>% 運算子
tidyr:進行長寬表格的轉換
dplyr:更有效率地作資料處理
tibble為tidyverse的主要資料型態(如同dataframe一樣),但是比起內建的dataframe型態方便

安裝與使用

install.packages("tidyverse")
library(tidyverse)

tibble

as_tibble():將原本的dataframe轉成tibble
tibble():從無到有建立

%>% 運算子

#以下兩行程式功能一模一樣
summary(cars)      # 傳統呼叫函數
cars %>% summary() # 使用 %>%

gather()寬轉長表格

#data.frame建立表格
team_name <- c("Chicago Bulls", "Golden State Warriors")
wins <- c(72, 73)
losses <- c(10, 9)
great_nba_teams <- data.frame(team_name, wins, losses)
great_nba_teams
team_name wins losses
1 Chicago Bulls 72 10
2 Golden State Warriors 73 9
#新增欄位(直接$欄位名稱即可)
great_nba_teams <- data.frame(team_name, wins, losses)
great_nba_teams$winning_percentage <- great_nba_teams$wins / (great_nba_teams$wins + great_nba_teams$losses)
great_nba_teams
team_name wins losses winning_percentage
1 Chicago Bulls 72 10 0.8780488
2 Golden State Warriors 73 9 0.8902439
#寬轉長表格
gather(great_nba_teams, key = variable_names, value = values, wins, losses)
team_name variable_names values
1 Chicago Bulls wins 72
2 Golden State Warriors wins 73
3 Chicago Bulls losses 10
4 Golden State Warriors losses 9

spread()長轉寬表格

long_format <- gather(great_nba_teams, key = variable_names, value = values, wins, losses)
spread(long_format, key = variable_names, value = values)

dplyr套件

filter()篩選符合條件的觀測值(rows左到右)
select()選擇變數(columns上到下)
mutate()新增變數(columns上到下)
arrange()依照指定變數排序觀測值(rows左到右)
summarise()聚合變數
group_by()依照類別變數分組搭配

filter()

直接加邏輯判斷式,篩選橫列

filter(mtcars, cyl == 8)
filter(mtcars, cyl < 6)
# Multiple criteria
filter(mtcars, cyl < 6 & vs == 1)   #AND
filter(mtcars, cyl < 6 | vs == 1)   #OR
# Multiple arguments are equivalent to and
filter(mtcars, cyl < 6, vs == 1)

slice()

使用index索引來選定特定橫列

diamonds %>% slice(1:5)

select()

starts_with() 挑選欄位名稱開頭有””字串的欄位
ends_with() 挑選欄位名稱結尾有””字串的欄位
contains() 挑選欄位名稱中包含””字串的欄位
matches() 挑選欄位名稱符合””字串(正規表示式)的欄位
---------------------
select(mtcars, starts_with("m"))
select(mtcars,  ends_with("b"))
select(mtcars, contains("ra"))
select(mtcars, matches("a"))
select(mtcars, disp, am)
select(mpg, -manufacturer, -fl) #沒有manufacturer和fl的欄位(mpg為ggplot2內建資料集,mtcars沒有這兩個欄位)
select(mpg, -c(manufacturer, fl)) #沒有manufacturer和fl的欄位
select(mtcars, -1, -7)  #排除第1欄與第7欄(若要排除第1到第7欄,請用 -(1:7))

mutate()

將運算後的結果,產生新的欄位

> library(tidyverse)
> season <- c("1995-96", "2015-16")
> great_nba_teams <- data.frame(team_name, wins, losses, stringsAsFactors = FALSE)
> mutate(great_nba_teams,
+ winning_percentage = wins / (wins + losses),
+ season = season
+ )
team_name wins losses winning_percentage season
1 Chicago Bulls 72 10 0.8780488 1995-96
2 Golden State Warriors 73 9 0.8902439 2015-16

arrange()

按照順序排

arrange(mtcars, cyl, disp) #小到大
arrange(mtcars, desc(disp))#大到小

summarise()

library(tidyverse)

straw_hat_df <- data.frame(name, gender, age, stringsAsFactors = FALSE)
summarise(straw_hat_df, mean(age))
mean(age)
1 30.33333
summarise(mtcars, mean(disp))
summarise(group_by(mtcars, cyl), mean(disp))
summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))

group_by()

library(tidyverse)

straw_hat_df <- data.frame(name, gender, age, stringsAsFactors = FALSE)
group_by(straw_hat_df, gender) %>%
+ summarise(mean(age)) %>%
+ as.data.frame()
gender mean(age)
1 女 25.00000
2 男 31.85714
-----------------
summarise(mtcars, mean(disp))
summarise(group_by(mtcars, cyl), mean(disp))
summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))

資料合併

#列與欄的合併(bind_rows合併列,bind_cols合併欄)
dplyr::bind_rows(A,B)
dplyr::bind_cols(A,B)
#交集與聯集
dplyr::intersect(A,B)
dplyr::union(A,B)
#差集
dplyr::setdiff(A,B) #有A但是不包含B
dplyr::setdiff(B,A) #有B但是不包含A

Written by

Machine Learning / Deep Learning / Python / Flutter cakeresume.com/yanwei-liu

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store