1 パッケージ読み込み

library(tidyverse)
library(palmerpenguins)
library(janitor)

1.1 パッケージ入ってない場合はインストールして読み込む場合(コードのみ表示)

if (!require("palmerpenguins")) {
   install.packages("palmerpenguins")
   library(palmerpenguins)
}

2 github連携

2.1 githubのレポジトリをローカルに作成して管理

  • github上でCode>cloneでurlをコピー
  • RStudioでNew Project>Version Control>Git>Repositry URL にurl貼り付けてCreate Project

3 ベクトル操作

3.1 作成 

seq(1,3, by = 1)
## [1] 1 2 3
seq(1,10, by = 2)
## [1] 1 3 5 7 9
rep(1, 3)
## [1] 1 1 1
rep(1:3, 3)
## [1] 1 2 3 1 2 3 1 2 3
rep(1:3, each = 3)
## [1] 1 1 1 2 2 2 3 3 3

3.2 要素を削除

  • dplyr::setdiff
abcd <- c("a", "b", "c", "d")
rmitem <- c("b", "c")

abcd |> 
  setdiff(rmitem)
## [1] "a" "d"
  • baseの操作
abcd [! abcd %in% rmitem]
## [1] "a" "d"

3.3 要素のソート

sort(c("c", "a", "b"))
## [1] "a" "b" "c"

4 正規表現

4.1 数字,文字

4.1.1 例の作成

num <- c("1", "11", "111", "l", "|", "one", "一")

4.1.2 数字

  • どれも同じ

一字

str_view(num, "[0-9]")
## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>
str_view(num, "[:digit:]")
## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>
str_view(num, "\\d")
## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>

三字

str_view(num, "[0-9][0-9][0-9]")
## [3] │ <111>

4.1.3 アルファベット

アルファベットのみ

str_view(num, "[a-z]")
## [4] │ <l>
## [6] │ <o><n><e>

日本語も

str_view(num, "[:alpha:]") # 日本語も入る
## [4] │ <l>
## [6] │ <o><n><e>
## [7] │ <一>

str_view_all

str_view_all(num, "[a-z]")
## [1] │ 1
## [2] │ 11
## [3] │ 111
## [4] │ <l>
## [5] │ |
## [6] │ <o><n><e>
## [7] │ 一

文字数指定

str_view(num, "[a-z][a-z][a-z]")
## [6] │ <one>
str_view(num, "[a-z]..")
## [6] │ <one>
str_view(num, "[a-z]{3}")
## [6] │ <one>

4.1.4 数字とアルファベット

一文字

str_view(num, "[0-9a-z]")
## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>
## [4] │ <l>
## [6] │ <o><n><e>

すべて

str_view_all(num, "[0-9a-z]")
## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>
## [4] │ <l>
## [5] │ |
## [6] │ <o><n><e>
## [7] │ 一

指定文字数

str_view(num, "...")
## [3] │ <111>
## [6] │ <one>

4.2 単語のつながり

4.2.1 例の作成

text <- c("test", "test_test", "test_test_test", "testtest", 
          "test_testtest", "testtest_test", "test_test_","teest_teest")

4.2.2 繰り返し

一回

str_view(text, "(....)_\\1")
## [2] │ <test_test>
## [3] │ <test_test>_test
## [5] │ <test_test>test
## [6] │ test<test_test>
## [7] │ <test_test>_
str_view(text, "(.+)_\\1")
## [2] │ <test_test>
## [3] │ <test_test>_test
## [5] │ <test_test>test
## [6] │ test<test_test>
## [7] │ <test_test>_
## [8] │ <teest_teest>
str_view(text, "(....)\\1")
## [4] │ <testtest>
## [5] │ test_<testtest>
## [6] │ <testtest>_test
str_view(text, "(....)\\1_\\1")
## [6] │ <testtest_test>

二回

str_view(text, "(....)_\\1_\\1")
## [3] │ <test_test_test>

4.3 区分する点を探す

  • 変数名でpivot_longerするときに

4.3.1 検討用データフレーム作成

mtcars_vsam <- 
  mtcars |> 
  summarise(across(c(vs, am),
                   list(
                     p = \(x) mean(x, na.rm = TRUE),
                     n = \(x) sum(x, na.rm = TRUE),
               total_n = \(x) sum(!is.na(x))
                   ))
            )

mtcars_vsam
##     vs_p vs_n vs_total_n    am_p am_n am_total_n
## 1 0.4375   14         32 0.40625   13         32
# purrr 1.0.0より前
# mtcars_vsam <- 
#   mtcars |> 
#   summarise(across(c(vs, am),
#                    list(
#                      p = ~mean(., na.rm = TRUE),
#                      n = ~sum(., na.rm = TRUE),
#                total_n = ~sum(!is.na(.))
#                    ))
#             )

4.3.1.1 meanとsdを列名にするpivot_longer(×な例)

  • total_nも列名にもってきたい
mtcars_vsam |> 
  pivot_longer(everything(),
               names_to = c("items", ".value"),
               names_pattern = "(.*)_(.*)")
## # A tibble: 4 × 3
##   items         p     n
##   <chr>     <dbl> <dbl>
## 1 vs        0.438    14
## 2 vs_total NA        32
## 3 am        0.406    13
## 4 am_total NA        32

4.3.2 変数名の文字の長さ確認

# 降順に並び替えて1行目を取得
max_value <- 
names(mtcars_vsam) |>
  str_count() |>
  as_tibble() |>
  arrange(desc(value))

max_value
## # A tibble: 6 × 1
##   value
##   <int>
## 1    10
## 2    10
## 3     4
## 4     4
## 5     4
## 6     4
# 文字数の最大値取得
max_value <- 
  max_value |> 
  slice(1)

# total_pの前までで切る情報を追加したいため、最大値-total_pの文字数計算

max_value - str_count("total_p")
##   value
## 1     3

4.3.2.1 meanとsdを列名にするpivot_longer(okな例)

mtcars_vsam |> 
  pivot_longer(everything(),
               names_to = c("items", ".value"),
               names_pattern = "(.{1,2})_(.+)") # _の前は1~2字までに限定
## # A tibble: 2 × 4
##   items     p     n total_n
##   <chr> <dbl> <dbl>   <int>
## 1 vs    0.438    14      32
## 2 am    0.406    13      32
4.3.2.1.1 {n}の挙動

https://r4ds.had.co.nz/strings.html#repetition

文字
str_view("xxxxx", ".{1}")
## [1] │ <x><x><x><x><x>
str_view("xxxxx", "x{3}")
## [1] │ <xxx>xx
x <- c("xx_xxx","x_xx_xxx_xxxx_xxxxx")

str_view(x, "x{2,4}")
## [1] │ <xx>_<xxx>
## [2] │ x_<xx>_<xxx>_<xxxx>_<xxxx>x
str_view_all(x, "x{2,4}")
## [1] │ <xx>_<xxx>
## [2] │ x_<xx>_<xxx>_<xxxx>_<xxxx>x
数字
numbers <- 
  c("123", "123456", "123456789")

str_view(numbers, ".{4,}")
## [2] │ <123456>
## [3] │ <123456789>
# str_view(numbers, ".{,9}")

str_view(numbers, ".{4,7}")
## [2] │ <123456>
## [3] │ <1234567>89
str_view(numbers, ".{4,7}?") # lazy
## [2] │ <1234>56
## [3] │ <1234><5678>9

4.4 連続量を2値化して割合を出して1つのデータフレームに

4.4.1 2値変数作成

df_peng <-
  penguins |>
  mutate(across(ends_with("mm"),
                list(m = \(x) if_else(x > mean(x, na.rm = TRUE),1,0)))
         )

4.4.2 割合とnとtotal_nを算出

df_peng_res <- 
df_peng |>
  summarise(across(ends_with("_m"),
                   list( p    = \(x) mean(x, na.rm = TRUE),
                         n    = \(x) sum(x, na.rm = TRUE),           # bill_length_mm_mが1のn
                         total_n    = \(x) sum(!is.na(x))))  # bill_length_mm_mのna以外のn(total)
  )


# 変数名の文字の長さ確認
# 降順に並び替えて1行目を取得
max_value <- 
names(df_peng_res) |>
  str_count() |>
  as_tibble() |>
  arrange(desc(value))

max_value
## # A tibble: 9 × 1
##   value
##   <int>
## 1    27
## 2    24
## 3    23
## 4    21
## 5    21
## 6    18
## 7    18
## 8    17
## 9    17
# 文字数の最大値取得
max_value <- 
  max_value |> 
  slice(1)

# total_pの前までで切る情報を追加したいため、最大値-total_pの文字数計算

max_value - str_count("total_p")
##   value
## 1    20

4.4.3 meanとsdを列名にするpivot_longer

df_peng_res |> 
pivot_longer(everything(),
               names_to = c("items", ".value"),
               names_pattern = "(.{1,20})_(.+)") # _の前は1~20字までに限定
## # A tibble: 3 × 4
##   items                   p     n total_n
##   <chr>               <dbl> <dbl>   <int>
## 1 bill_length_mm_m    0.512   175     342
## 2 bill_depth_mm_m     0.532   182     342
## 3 flipper_length_mm_m 0.433   148     342

5 度数分布とクロス表

5.1 度数分布

df <- tribble(~moji,
              "a",
              NA,
              "c",
              "a",
              "c",
              "a")

5.1.1 文字型でデフォルト

df |> count(moji)
## # A tibble: 3 × 2
##   moji      n
##   <chr> <int>
## 1 a         3
## 2 c         2
## 3 <NA>      1
df |> tabyl(moji)
##  moji n   percent valid_percent
##     a 3 0.5000000           0.6
##     c 2 0.3333333           0.4
##  <NA> 1 0.1666667            NA

5.1.1.1 NA外して

df |> 
  tabyl(moji, show_na = FALSE)
##  moji n percent
##     a 3     0.6
##     c 2     0.4

5.1.1.2 countで%

df |> 
  drop_na(moji) |> 
  count(moji) |> 
  mutate(percent = n/sum(n))
## # A tibble: 2 × 3
##   moji      n percent
##   <chr> <int>   <dbl>
## 1 a         3     0.6
## 2 c         2     0.4

5.1.2 因子型

  • 本当はカテゴリ”b”があるがデータとしてないことを示す
df <- 
  df |> 
  mutate(moji = factor(moji, levels = c("a", "b", "c")))

df |> count(moji)
## # A tibble: 3 × 2
##   moji      n
##   <fct> <int>
## 1 a         3
## 2 c         2
## 3 <NA>      1
df |> tabyl(moji)
##  moji n   percent valid_percent
##     a 3 0.5000000           0.6
##     b 0 0.0000000           0.0
##     c 2 0.3333333           0.4
##  <NA> 1 0.1666667            NA

5.1.2.1 countでデータのない水準もn=0で示す

df |> count(moji, .drop = FALSE)
## # A tibble: 4 × 2
##   moji      n
##   <fct> <int>
## 1 a         3
## 2 b         0
## 3 c         2
## 4 <NA>      1

5.1.2.2 tabylでデータのない水準もn=0で示してNAは計算に入れずtotalも示す

df |> 
  tabyl(moji, show_na = FALSE) |> 
  adorn_totals()
##   moji n percent
##      a 3     0.6
##      b 0     0.0
##      c 2     0.4
##  Total 5     1.0

5.1.3 数字

df <- tibble(suji = c(1,1,0,0,0,NA))

df |> tabyl(suji)
##  suji n   percent valid_percent
##     0 3 0.5000000           0.6
##     1 2 0.3333333           0.4
##    NA 1 0.1666667            NA
df |> 
  summarise(percent = mean(suji, na.rm = TRUE),
            n = sum(suji, na.rm = TRUE),
            total_n = sum(!is.na(suji)))
## # A tibble: 1 × 3
##   percent     n total_n
##     <dbl> <dbl>   <int>
## 1     0.4     2       5

5.2 クロス表

5.2.1 count

mtcars |> 
  count(gear, vs)
##   gear vs  n
## 1    3  0 12
## 2    3  1  3
## 3    4  0  2
## 4    4  1 10
## 5    5  0  4
## 6    5  1  1

5.2.1.1 割合追加

  • gearの水準別のvsの割合(%)
mtcars |> 
  count(gear, vs) |> 
  group_by(gear) |> 
  mutate(perc = n/sum(n)*100)
## # A tibble: 6 × 4
## # Groups:   gear [3]
##    gear    vs     n  perc
##   <dbl> <dbl> <int> <dbl>
## 1     3     0    12  80  
## 2     3     1     3  20  
## 3     4     0     2  16.7
## 4     4     1    10  83.3
## 5     5     0     4  80  
## 6     5     1     1  20

5.2.2 欠損ある場合

penguins |> 
  count(species, sex)
## # A tibble: 8 × 3
##   species   sex        n
##   <fct>     <fct>  <int>
## 1 Adelie    female    73
## 2 Adelie    male      73
## 3 Adelie    <NA>       6
## 4 Chinstrap female    34
## 5 Chinstrap male      34
## 6 Gentoo    female    58
## 7 Gentoo    male      61
## 8 Gentoo    <NA>       5

5.2.2.1 割合追加

5.2.2.1.1 欠損あり
penguins |> 
  count(species, sex) |>
  group_by(species) |>
  mutate(perc = n/sum(n)*100)
## # A tibble: 8 × 4
## # Groups:   species [3]
##   species   sex        n  perc
##   <fct>     <fct>  <int> <dbl>
## 1 Adelie    female    73 48.0 
## 2 Adelie    male      73 48.0 
## 3 Adelie    <NA>       6  3.95
## 4 Chinstrap female    34 50   
## 5 Chinstrap male      34 50   
## 6 Gentoo    female    58 46.8 
## 7 Gentoo    male      61 49.2 
## 8 Gentoo    <NA>       5  4.03
5.2.2.1.2 欠損drop
penguins |> 
  drop_na(species, sex) |> 
  count(species, sex) |>
  group_by(species) |>
  mutate(perc = n/sum(n)*100)
## # A tibble: 6 × 4
## # Groups:   species [3]
##   species   sex        n  perc
##   <fct>     <fct>  <int> <dbl>
## 1 Adelie    female    73  50  
## 2 Adelie    male      73  50  
## 3 Chinstrap female    34  50  
## 4 Chinstrap male      34  50  
## 5 Gentoo    female    58  48.7
## 6 Gentoo    male      61  51.3

5.2.3 tabyl

mtcars |> 
  tabyl(gear, vs)
##  gear  0  1
##     3 12  3
##     4  2 10
##     5  4  1

5.2.3.1 割合

mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages()
##  gear         0         1
##     3 0.8000000 0.2000000
##     4 0.1666667 0.8333333
##     5 0.8000000 0.2000000
割合とn
mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages() |> 
  adorn_ns()
##  gear                      0                      1
##     3               0.8 (12)               0.2  (3)
##     4 0.166666666666667  (2) 0.833333333333333 (10)
##     5               0.8  (4)               0.2  (1)
割合(%)とn
mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages() |> 
  adorn_pct_formatting() |> 
  adorn_ns()
##  gear          0          1
##     3 80.0% (12) 20.0%  (3)
##     4 16.7%  (2) 83.3% (10)
##     5 80.0%  (4) 20.0%  (1)
nと割合(%)
mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages() |> 
  adorn_pct_formatting() |> 
  adorn_ns(position = "front")
##  gear          0          1
##     3 12 (80.0%)  3 (20.0%)
##     4  2 (16.7%) 10 (83.3%)
##     5  4 (80.0%)  1 (20.0%)

5.2.4 3要因以上

penguins |> 
  count(island, species, sex) |>
  group_by(island, species) |>
  mutate(perc = n/sum(n)*100)
## # A tibble: 13 × 5
## # Groups:   island, species [5]
##    island    species   sex        n  perc
##    <fct>     <fct>     <fct>  <int> <dbl>
##  1 Biscoe    Adelie    female    22 50   
##  2 Biscoe    Adelie    male      22 50   
##  3 Biscoe    Gentoo    female    58 46.8 
##  4 Biscoe    Gentoo    male      61 49.2 
##  5 Biscoe    Gentoo    <NA>       5  4.03
##  6 Dream     Adelie    female    27 48.2 
##  7 Dream     Adelie    male      28 50   
##  8 Dream     Adelie    <NA>       1  1.79
##  9 Dream     Chinstrap female    34 50   
## 10 Dream     Chinstrap male      34 50   
## 11 Torgersen Adelie    female    24 46.2 
## 12 Torgersen Adelie    male      23 44.2 
## 13 Torgersen Adelie    <NA>       5  9.62
# mtcars |> 
#   count(gear, vs, am) |> 
#   group_by(gear, vs) |> 
#   mutate(perc = n/sum(n)*100)

5.3 modelsummary::datasummary()

modelsummary::datasummary_crosstab(species ~ sex, data = penguins)
tinytable_u4x6x0irk2f6hksmwrr6
species female male All
Adelie N 73 73 152
% row 48.0 48.0 100.0
Chinstrap N 34 34 68
% row 50.0 50.0 100.0
Gentoo N 58 61 124
% row 46.8 49.2 100.0
All N 165 168 344
% row 48.0 48.8 100.0

6 関数

6.1 基本

test <- function(x){
  x + 1
  }

test(1)
## [1] 2
test_xy <- function(x,y){
  x + y
}

test_xy(1, 2)
## [1] 3
  • 式が1行だけなら{}不要
test_simple <- function(x) x + 1

test_simple(10)
## [1] 11

6.1.1 省略形

test_s <- \(x){
  x + 1
  }

test_s(100)
## [1] 101
test_xy_s <- \(x,y){
  x + y
}

test_xy_s(2,3)
## [1] 5

6.1.2 関数内で数値または文字

test_num <- function(x){
  print(x)
} 

test_num(1)
## [1] 1
test_chr <- function(x){
  x2 <- deparse(substitute(x))
    print(x2)
} 

test_chr(1)
## [1] "1"

6.2 無名関数 anonymous function

(function(x) x + 1)(1)
## [1] 2

6.2.1 無名関数の省略形(R4.1以降)

(\(x) x + 1)(1)
## [1] 2
(\(x, y) x + y)(2,3)
## [1] 5

6.2.1.1 \(バックスラッシュ)の表示について

  • ブラウザでHTML開いたときに円マーク¥(本当は半角)になる場合は,ブラウザのフォント設定で固定幅フォントを英語のものに変えると解決

  • 具体的には,Chromeで三点リーダ > 設定 > デザイン > フォントをカスタマイズ > 固定幅フォントがデフォルトでMS Gothicになっている場合,ArialとかCourier Newに変える

6.3 関数の中身確認

select
## function (.data, ...) 
## {
##     UseMethod("select")
## }
## <bytecode: 0x0000021e49b55b70>
## <environment: namespace:dplyr>
  • methods()で調べるといくつか候補が出る
methods(select)
## [1] select.data.frame* select.list       
## see '?methods' for accessing help and source code
  • 元のパッケージに:::(注:コロン3つ)をつけてmethodsで調べた対象を入力して実行するとコードが確認できる
dplyr:::select.data.frame
## function (.data, ...) 
## {
##     error_call <- dplyr_error_call()
##     loc <- tidyselect::eval_select(expr(c(...)), data = .data, 
##         error_call = error_call)
##     loc <- ensure_group_vars(loc, .data, notify = TRUE)
##     out <- dplyr_col_select(.data, loc)
##     out <- set_names(out, names(loc))
##     out
## }
## <bytecode: 0x0000021e529122b0>
## <environment: namespace:dplyr>
  • または関数を使うとパッケージ名なしでもできる
getAnywhere(select.data.frame)
## A single object matching 'select.data.frame' was found
## It was found in the following places
##   registered S3 method for select from namespace dplyr
##   namespace:dplyr
## with value
## 
## function (.data, ...) 
## {
##     error_call <- dplyr_error_call()
##     loc <- tidyselect::eval_select(expr(c(...)), data = .data, 
##         error_call = error_call)
##     loc <- ensure_group_vars(loc, .data, notify = TRUE)
##     out <- dplyr_col_select(.data, loc)
##     out <- set_names(out, names(loc))
##     out
## }
## <bytecode: 0x0000021e529122b0>
## <environment: namespace:dplyr>

6.4 平均値算出

6.4.1 平均値とn

mean_n <- 
  function(data, variable){
    data |> 
      summarise(across({{variable}},
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
                )
  }

mean_n(penguins, where(is.numeric))
## # A tibble: 1 × 10
##   bill_length_mm_mean bill_length_mm_n bill_depth_mm_mean bill_depth_mm_n
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342
## # ℹ 6 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_n <int>,
## #   body_mass_g_mean <dbl>, body_mass_g_n <int>, year_mean <dbl>, year_n <int>

6.4.1.1

mean_ns <- 
  \(data, variable){
    data |> 
      summarise(across({{variable}},
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
                )
  }

mean_ns(penguins, where(is.numeric))
## # A tibble: 1 × 10
##   bill_length_mm_mean bill_length_mm_n bill_depth_mm_mean bill_depth_mm_n
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342
## # ℹ 6 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_n <int>,
## #   body_mass_g_mean <dbl>, body_mass_g_n <int>, year_mean <dbl>, year_n <int>

6.4.1.2 確認

penguins |> 
      summarise(across(where(is.numeric),
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
                )
## # A tibble: 1 × 10
##   bill_length_mm_mean bill_length_mm_n bill_depth_mm_mean bill_depth_mm_n
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342
## # ℹ 6 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_n <int>,
## #   body_mass_g_mean <dbl>, body_mass_g_n <int>, year_mean <dbl>, year_n <int>

6.4.2 平均値とnで列をmeanとnに

cmean_n <- 
  function(data, variable){
    data |> 
      summarise(across({{variable}},
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
      ) |> 
      pivot_longer(everything(),
                   names_to = c("variables", ".value"), # ".value"の部分を列名に
                   names_pattern = "(.*)_(.*)")
  }

cmean_n(penguins, where(is.numeric))
## # A tibble: 5 × 3
##   variables           mean     n
##   <chr>              <dbl> <int>
## 1 bill_length_mm      43.9   342
## 2 bill_depth_mm       17.2   342
## 3 flipper_length_mm  201.    342
## 4 body_mass_g       4202.    342
## 5 year              2008.    344

6.4.2.1 確認

penguins |> 
      summarise(across(where(is.numeric),
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
      ) |> 
      pivot_longer(everything(),
                   names_to = c("variables", ".value"), # ".value"の部分を列名に
                   names_pattern = "(.*)_(.*)")
## # A tibble: 5 × 3
##   variables           mean     n
##   <chr>              <dbl> <int>
## 1 bill_length_mm      43.9   342
## 2 bill_depth_mm       17.2   342
## 3 flipper_length_mm  201.    342
## 4 body_mass_g       4202.    342
## 5 year              2008.    344

6.4.3 totalとgroupの平均値とnを一気に算出

all_group = function(data, outcome_vars, group_vars = NULL) {
  data |> 
    group_by(pick({{group_vars}})) |> 
    summarise(across({{outcome_vars}}, 
                     list(mean = \(x) mean(x, na.rm = TRUE),
                          n    = \(x) sum(!is.na(x)))))
}

all_group(penguins, bill_length_mm)
## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342
all_group(penguins, bill_length_mm, species)
## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123
# dplyr 1.1.0より前
#    group_by(across({{group_vars}})) |> 

6.4.3.1 確認

penguins |> 
  summarise(across(bill_length_mm,
                   list(mean = \(x) mean(x, na.rm = TRUE),
                          n    = \(x) sum(!is.na(x)))))
## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342
penguins |> 
  group_by(species) |> 
  summarise(across(bill_length_mm,
                   list(mean = \(x) mean(x, na.rm = TRUE),
                          n    = \(x) sum(!is.na(x)))))
## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123

6.4.3.2 mapでリストに結果を格納

quos(NULL, species) |> 
  map( \(x) all_group(penguins, bill_length_mm, !!x))
## [[1]]
## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342
## 
## [[2]]
## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123
quos(NULL, species, island, c(species, island)) |> 
  map( \(x) all_group(penguins, bill_length_mm, !!x))
## [[1]]
## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342
## 
## [[2]]
## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123
## 
## [[3]]
## # A tibble: 3 × 3
##   island    bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Biscoe                   45.3              167
## 2 Dream                    44.2              124
## 3 Torgersen                39.0               51
## 
## [[4]]
## # A tibble: 5 × 4
## # Groups:   species [3]
##   species   island    bill_length_mm_mean bill_length_mm_n
##   <fct>     <fct>                   <dbl>            <int>
## 1 Adelie    Biscoe                   39.0               44
## 2 Adelie    Dream                    38.5               56
## 3 Adelie    Torgersen                39.0               51
## 4 Chinstrap Dream                    48.8               68
## 5 Gentoo    Biscoe                   47.5              123

6.4.3.3 1つのデータフレームに

quos(NULL, species) |> 
  map( \(x) all_group(penguins, bill_length_mm, !!x)) |> 
  bind_rows()
## # A tibble: 4 × 3
##   bill_length_mm_mean bill_length_mm_n species  
##                 <dbl>            <int> <fct>    
## 1                43.9              342 <NA>     
## 2                38.8              151 Adelie   
## 3                48.8               68 Chinstrap
## 4                47.5              123 Gentoo
# こちらでもOK
# quos(NULL, species) |> 
#   map_dfr( \(x) all_group(penguins, bill_length_mm, !!x))
quos(NULL, species, island, c(species, island)) |> 
  map(\(x) all_group(penguins, bill_length_mm, !!x)) |> 
  bind_rows()
## # A tibble: 12 × 4
##    bill_length_mm_mean bill_length_mm_n species   island   
##                  <dbl>            <int> <fct>     <fct>    
##  1                43.9              342 <NA>      <NA>     
##  2                38.8              151 Adelie    <NA>     
##  3                48.8               68 Chinstrap <NA>     
##  4                47.5              123 Gentoo    <NA>     
##  5                45.3              167 <NA>      Biscoe   
##  6                44.2              124 <NA>      Dream    
##  7                39.0               51 <NA>      Torgersen
##  8                39.0               44 Adelie    Biscoe   
##  9                38.5               56 Adelie    Dream    
## 10                39.0               51 Adelie    Torgersen
## 11                48.8               68 Chinstrap Dream    
## 12                47.5              123 Gentoo    Biscoe
# こちらでもOK
# quos(NULL, species, island, c(species, island)) |> 
#   map_dfr( \(x) all_group(penguins, bill_length_mm, !!x))

6.5 クロス表

6.5.1 複数層別と単一のアウトカム

nest_tabyl = function(df, outcome_var, nest_vars = NULL){
df |>
  nest(data = !c({{nest_vars}})) |>
    mutate(freq =
           map(data, \(x) tabyl(x, {{outcome_var}}))) |> # show_na = FALSEで欠損無しのpercentのみ算出
  select(!data) |>
  unnest(cols = c(freq)) |>
  as_tibble()
}

# island x species
nest_tabyl(penguins, species, island)
## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0
# speciesのみ
nest_tabyl(penguins, species)
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
# 動かない
# nest_tabyl(penguins, c(species, island))

6.5.1.1 複数層別

# 層:全体,island
quos(NULL, island) |> 
  map( \(x) nest_tabyl(penguins, species, !!x))
## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0
# 層:全体,island, sex, island x sex
quos(NULL, island, sex, c(island, sex)) |> 
  map( \(x) nest_tabyl(penguins, species, !!x))
## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0    
## 
## [[3]]
## # A tibble: 9 × 4
##   sex    species       n percent
##   <fct>  <fct>     <int>   <dbl>
## 1 male   Adelie       73   0.435
## 2 male   Chinstrap    34   0.202
## 3 male   Gentoo       61   0.363
## 4 female Adelie       73   0.442
## 5 female Chinstrap    34   0.206
## 6 female Gentoo       58   0.352
## 7 <NA>   Adelie        6   0.545
## 8 <NA>   Chinstrap     0   0    
## 9 <NA>   Gentoo        5   0.455
## 
## [[4]]
## # A tibble: 27 × 5
##    island    sex    species       n percent
##    <fct>     <fct>  <fct>     <int>   <dbl>
##  1 Torgersen male   Adelie       23   1    
##  2 Torgersen male   Chinstrap     0   0    
##  3 Torgersen male   Gentoo        0   0    
##  4 Torgersen female Adelie       24   1    
##  5 Torgersen female Chinstrap     0   0    
##  6 Torgersen female Gentoo        0   0    
##  7 Torgersen <NA>   Adelie        5   1    
##  8 Torgersen <NA>   Chinstrap     0   0    
##  9 Torgersen <NA>   Gentoo        0   0    
## 10 Biscoe    female Adelie       22   0.275
## # ℹ 17 more rows

6.5.1.2 確認

penguins |>
  nest(data = !c(island)) |>
    mutate(freq =
           map(data, \(x) tabyl(x, species))) |>
  select(!data) |>
  unnest(cols = c(freq)) |>
  as_tibble()
## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0

6.5.2 固定層別と複数のアウトカム

# アウトカム:species, island;層:なし
quos(species, island) |> 
  map( \(x) nest_tabyl(penguins, !!x))
## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 3 × 3
##   island        n percent
##   <fct>     <int>   <dbl>
## 1 Biscoe      168   0.488
## 2 Dream       124   0.360
## 3 Torgersen    52   0.151
# 文字でもOK
vars <- c("species", "island")

vars |> 
  map( \(x) nest_tabyl(penguins, x)) # !!.xでも動いた
## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 3 × 3
##   island        n percent
##   <fct>     <int>   <dbl>
## 1 Biscoe      168   0.488
## 2 Dream       124   0.360
## 3 Torgersen    52   0.151
# アウトカム:species, island;層:sex
quos(species, island) |> 
  map( \(x) nest_tabyl(penguins, !!x, sex))
## [[1]]
## # A tibble: 9 × 4
##   sex    species       n percent
##   <fct>  <fct>     <int>   <dbl>
## 1 male   Adelie       73   0.435
## 2 male   Chinstrap    34   0.202
## 3 male   Gentoo       61   0.363
## 4 female Adelie       73   0.442
## 5 female Chinstrap    34   0.206
## 6 female Gentoo       58   0.352
## 7 <NA>   Adelie        6   0.545
## 8 <NA>   Chinstrap     0   0    
## 9 <NA>   Gentoo        5   0.455
## 
## [[2]]
## # A tibble: 9 × 4
##   sex    island        n percent
##   <fct>  <fct>     <int>   <dbl>
## 1 male   Biscoe       83  0.494 
## 2 male   Dream        62  0.369 
## 3 male   Torgersen    23  0.137 
## 4 female Biscoe       80  0.485 
## 5 female Dream        61  0.370 
## 6 female Torgersen    24  0.145 
## 7 <NA>   Biscoe        5  0.455 
## 8 <NA>   Dream         1  0.0909
## 9 <NA>   Torgersen     5  0.455

6.5.2.1 同じ文字を含む列名を1つにまとめる

  • それぞれのtabyl結果はアウトカム変数名の列名が異なる
  • 単純にbind_rowsすると変数名列が横に増えていく
  • それを回避するため,tabyl結果が入ったlistに,それぞれ同じ名前にrenamaeしていく
library(psychTools) # bfiの格納パッケージ

ac <- 
  bfi |>
  select(A1,A2,C1,C2) |> 
  names()  

# 上で定義した変数についてnest_tabyl実行
ac_res <- 
map(all_of(ac), \(x) nest_tabyl(bfi, x, gender)) |> 
  set_names(ac) # 各要素に項目名の名前を設定


# 各リスト要素の上から3つを表示
map(ac_res, \(x) head(x, 3))
## $A1
## # A tibble: 3 × 5
##   gender    A1     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1   202   0.220         0.220
## 2      1     2   284   0.309         0.309
## 3      1     3   160   0.174         0.174
## 
## $A2
## # A tibble: 3 × 5
##   gender    A2     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1    26  0.0283        0.0286
## 2      1     2    63  0.0686        0.0694
## 3      1     3    67  0.0729        0.0738
## 
## $C1
## # A tibble: 3 × 5
##   gender    C1     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1    23  0.0250        0.0252
## 2      1     2    53  0.0577        0.0581
## 3      1     3   103  0.112         0.113 
## 
## $C2
## # A tibble: 3 × 5
##   gender    C2     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1    31  0.0337        0.0340
## 2      1     2    95  0.103         0.104 
## 3      1     3   112  0.122         0.123
# 各変数名を統一した名前にrename
ac_res <- 
ac_res |> 
  map(\(x) rename_with(x,
                   \(x) "response",               # 統一名をここに
                   starts_with(c("A", "C")))
      )

# 各リスト要素の上から3つを表示
map(ac_res, \(x) head(x, 3))
## $A1
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1   202   0.220         0.220
## 2      1        2   284   0.309         0.309
## 3      1        3   160   0.174         0.174
## 
## $A2
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1    26  0.0283        0.0286
## 2      1        2    63  0.0686        0.0694
## 3      1        3    67  0.0729        0.0738
## 
## $C1
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1    23  0.0250        0.0252
## 2      1        2    53  0.0577        0.0581
## 3      1        3   103  0.112         0.113 
## 
## $C2
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1    31  0.0337        0.0340
## 2      1        2    95  0.103         0.104 
## 3      1        3   112  0.122         0.123
# 縦に連結
ac_res |> 
  bind_rows(.id = "items")
## # A tibble: 56 × 6
##    items gender response     n percent valid_percent
##    <chr>  <int>    <int> <int>   <dbl>         <dbl>
##  1 A1         1        1   202 0.220          0.220 
##  2 A1         1        2   284 0.309          0.309 
##  3 A1         1        3   160 0.174          0.174 
##  4 A1         1        4   139 0.151          0.151 
##  5 A1         1        5    99 0.108          0.108 
##  6 A1         1        6    34 0.0370         0.0370
##  7 A1         1       NA     1 0.00109       NA     
##  8 A1         2        1   720 0.383          0.386 
##  9 A1         2        2   534 0.284          0.286 
## 10 A1         2        3   242 0.129          0.130 
## # ℹ 46 more rows

6.5.2.2 複数層をみるためmtcarsで

# アウトカム:cyl, gear;層:vs
quos(cyl, gear) |> 
  map( \(x) nest_tabyl(mtcars, !!x, vs))
## [[1]]
## # A tibble: 5 × 4
##      vs   cyl     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     0     4     1  0.0556
## 2     0     6     3  0.167 
## 3     0     8    14  0.778 
## 4     1     4    10  0.714 
## 5     1     6     4  0.286 
## 
## [[2]]
## # A tibble: 6 × 4
##      vs  gear     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     0     3    12  0.667 
## 2     0     4     2  0.111 
## 3     0     5     4  0.222 
## 4     1     3     3  0.214 
## 5     1     4    10  0.714 
## 6     1     5     1  0.0714
# アウトカム:cyl, gear;層:vs x am
quos(cyl, gear) |> 
  map( \(x) nest_tabyl(mtcars, !!x, c(vs,am)))
## [[1]]
## # A tibble: 7 × 5
##      vs    am   cyl     n percent
##   <dbl> <dbl> <dbl> <int>   <dbl>
## 1     0     1     4     1   0.167
## 2     0     1     6     3   0.5  
## 3     0     1     8     2   0.333
## 4     1     1     4     7   1    
## 5     1     0     4     3   0.429
## 6     1     0     6     4   0.571
## 7     0     0     8    12   1    
## 
## [[2]]
## # A tibble: 7 × 5
##      vs    am  gear     n percent
##   <dbl> <dbl> <dbl> <int>   <dbl>
## 1     0     1     4     2   0.333
## 2     0     1     5     4   0.667
## 3     1     1     4     6   0.857
## 4     1     1     5     1   0.143
## 5     1     0     3     3   0.429
## 6     1     0     4     4   0.571
## 7     0     0     3    12   1

6.5.3 複数層別と複数アウトカム

groups <- quos(NULL, NULL, vs, am)
outcomes <- quos(cyl, gear, cyl, gear) # 層別と数をそろえるため対応する対象を繰り返す
  
  
map2(outcomes, groups,  \(x, y) nest_tabyl(mtcars, !!x, !!y))
## [[1]]
## # A tibble: 3 × 3
##     cyl     n percent
##   <dbl> <int>   <dbl>
## 1     4    11   0.344
## 2     6     7   0.219
## 3     8    14   0.438
## 
## [[2]]
## # A tibble: 3 × 3
##    gear     n percent
##   <dbl> <int>   <dbl>
## 1     3    15   0.469
## 2     4    12   0.375
## 3     5     5   0.156
## 
## [[3]]
## # A tibble: 5 × 4
##      vs   cyl     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     0     4     1  0.0556
## 2     0     6     3  0.167 
## 3     0     8    14  0.778 
## 4     1     4    10  0.714 
## 5     1     6     4  0.286 
## 
## [[4]]
## # A tibble: 4 × 4
##      am  gear     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     1     4     8   0.615
## 2     1     5     5   0.385
## 3     0     3    15   0.789
## 4     0     4     4   0.211

6.5.3.1 確認

tabyl(mtcars, vs, cyl) |> 
  adorn_percentages()
##  vs          4         6         8
##   0 0.05555556 0.1666667 0.7777778
##   1 0.71428571 0.2857143 0.0000000

6.5.4 欠損がある場合

7 rlangについて

7.1 curly-curly {{}}

7.1.1 rlang 0.4.0以降から

mean_by <- function(df, by, var){
df |> 
  group_by({{by}}) |> 
  summarise(mean = mean({{var}}, na.rm =  TRUE))
}

mean_by(penguins, species, bill_length_mm)
## # A tibble: 3 × 2
##   species    mean
##   <fct>     <dbl>
## 1 Adelie     38.8
## 2 Chinstrap  48.8
## 3 Gentoo     47.5

7.1.1.1 {{}}は!!enquo()のショートカット

mean_by2 <- function(df, by, var){
df |> 
  group_by(!!enquo(by)) |> 
  summarise(mean = mean(!!enquo(var), na.rm =  TRUE))
}

mean_by2(penguins, species, bill_length_mm)
## # A tibble: 3 × 2
##   species    mean
##   <fct>     <dbl>
## 1 Adelie     38.8
## 2 Chinstrap  48.8
## 3 Gentoo     47.5

7.1.2 作成する変数名にglueを利用する

  • :=はセイウチ演算子(walrus operator)というらしい
mean_by3 <- function(df, by, var, prefix = "mean"){
  df |>
    group_by({{ by }}) |>
    summarise("{prefix}_{{ var }}" := mean({{ var }}, na.rm = TRUE))
}

mean_by3(penguins, species, bill_length_mm, prefix = "mean")
## # A tibble: 3 × 2
##   species   mean_bill_length_mm
##   <fct>                   <dbl>
## 1 Adelie                   38.8
## 2 Chinstrap                48.8
## 3 Gentoo                   47.5

8 map関連

8.1 データフレーム内の複数の変数に一度に関数を適用

# 使用する変数格納
vars <-
  mtcars |> 
  select(vs,am,gear) |> 
  names()

map(vars, \(x) count(mtcars, .data[[x]])) |> 
  set_names(vars)                             # listの要素名付与
## $vs
##   vs  n
## 1  0 18
## 2  1 14
## 
## $am
##   am  n
## 1  0 19
## 2  1 13
## 
## $gear
##   gear  n
## 1    3 15
## 2    4 12
## 3    5  5
# 割合も
map(vars, \(x) count(mtcars, .data[[x]])) |> 
  set_names(vars) |> 
map(\(x) mutate(x, percent = n/sum(n)))  
## $vs
##   vs  n percent
## 1  0 18  0.5625
## 2  1 14  0.4375
## 
## $am
##   am  n percent
## 1  0 19 0.59375
## 2  1 13 0.40625
## 
## $gear
##   gear  n percent
## 1    3 15 0.46875
## 2    4 12 0.37500
## 3    5  5 0.15625

8.2 リストの各要素のデータフレームで特定の変数をrename

# renameしたい変数の型を因子型に
mtcars_f <- 
  mtcars |> 
  mutate(across(all_of(vars),
                factor))

tables <- 
map(vars, \(x) count(mtcars_f, .data[[x]], .drop = FALSE)) |> 
  set_names(vars) |> 
map(\(x) mutate(x, percent = n/sum(n)))

# 各変数に対し関数を適用(tidyselect 1.2.0 >= でwarningが出るように)
# tables <- 
# map(vars, \(x) tabyl(mtcars_f, .data[[x]])) |> 
#   set_names(vars)

# 各変数のtabyl結果で連番作成
tables <- 
  map(tables, \(x) mutate(x, vid = row_number()))

# 各リスト要素の変数名の列名に文字を追加
tables <- 
tables |> 
    map(\(x) rename_with(x, 
                     \(y) str_c(y, "_test"),
                     starts_with("per")))

# purrr 1.0.0より前
# tables <- 
#   tables |> 
#   map(~rename_with(., 
#                       ~ str_c(., "_test"),
#                       starts_with("per")))

# 各リスト要素の変数名の列を同じ名前に
tables <- 
  tables |> 
  map(\(x) rename_with(x, 
                       ~ "levels",
                       where(is.factor)))

# purrr 1.0.0より前
# tables <- 
#   tables |> 
#   map(~rename_with(., 
#                       ~ "levels",
#                       where(is.factor)))

bind_rows(tables, .id = "var_name")
##   var_name levels  n percent_test vid
## 1       vs      0 18      0.56250   1
## 2       vs      1 14      0.43750   2
## 3       am      0 19      0.59375   1
## 4       am      1 13      0.40625   2
## 5     gear      3 15      0.46875   1
## 6     gear      4 12      0.37500   2
## 7     gear      5  5      0.15625   3

8.3 imap & rename_with

df_2020 <- tribble(
~id, ~var1, ~var2,
1, 1, 0,
2, 0, 1,
3, 1, 1
)

df_2021 <- tribble(
~id, ~var1, ~var2,
1, 0, 0,
2, 1, 0,
3, 1, 1
)

# データフレームをリストにまとめて要素名をつける
df_list <- 
  list(df_2020, df_2021) |> 
  set_names("y20", "y21")

# リスト内のデータフレームのid以外の変数に要素名の接尾辞をつける
df_list <- 
 imap(df_list,
       function(a,b){
         rename_with(a,
          \(x) {str_c(x, b, sep = "_")}, !id)}
      )

df_list
## $y20
## # A tibble: 3 × 3
##      id var1_y20 var2_y20
##   <dbl>    <dbl>    <dbl>
## 1     1        1        0
## 2     2        0        1
## 3     3        1        1
## 
## $y21
## # A tibble: 3 × 3
##      id var1_y21 var2_y21
##   <dbl>    <dbl>    <dbl>
## 1     1        0        0
## 2     2        1        0
## 3     3        1        1
# リスト内のデータフレームを横に連結

df_list |>
  reduce(full_join, by = "id")
## # A tibble: 3 × 5
##      id var1_y20 var2_y20 var1_y21 var2_y21
##   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1     1        1        0        0        0
## 2     2        0        1        1        0
## 3     3        1        1        1        1

8.4 回帰:複数のアウトカムに同じ説明変数

# アウトカム変数名格納
outcomes <- 
  penguins |> 
  select(ends_with("_mm")) |> 
  names()
  
# 回帰
res_l <- 
  str_c(outcomes, "~ species*sex") |>
  map(\(x) lm(as.formula(x), data = penguins)) |> 
  set_names(outcomes)

# 結果の要約

  res_l |> 
  map(\(x) broom::tidy(x, conf.int = TRUE)) |> 
  bind_rows(.id = "outcome") |> 
    print(n = 18)
## # A tibble: 18 × 8
##    outcome       term  estimate std.error statistic   p.value conf.low conf.high
##    <chr>         <chr>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 bill_length_… (Int…  37.3       0.271    137.    2.30e-291  36.7       37.8  
##  2 bill_length_… spec…   9.32      0.481     19.4   3.06e- 56   8.37      10.3  
##  3 bill_length_… spec…   8.31      0.407     20.4   3.18e- 60   7.50       9.11 
##  4 bill_length_… sexm…   3.13      0.383      8.17  6.64e- 15   2.38       3.89 
##  5 bill_length_… spec…   1.39      0.680      2.04  4.21e-  2   0.0501     2.73 
##  6 bill_length_… spec…   0.777     0.572      1.36  1.75e-  1  -0.348      1.90 
##  7 bill_depth_mm (Int…  17.6       0.0976   180.    0          17.4       17.8  
##  8 bill_depth_mm spec…  -0.0337    0.173     -0.194 8.46e-  1  -0.374      0.307
##  9 bill_depth_mm spec…  -3.38      0.147    -23.1   1.55e- 70  -3.67      -3.10 
## 10 bill_depth_mm sexm…   1.45      0.138     10.5   1.96e- 22   1.18       1.72 
## 11 bill_depth_mm spec…   0.214     0.245      0.874 3.83e-  1  -0.268      0.696
## 12 bill_depth_mm spec…   0.0294    0.206      0.143 8.87e-  1  -0.376      0.435
## 13 flipper_leng… (Int… 188.        0.662    284.    0         186.       189.   
## 14 flipper_leng… spec…   3.94      1.17       3.36  8.84e-  4   1.63       6.25 
## 15 flipper_leng… spec…  24.9       0.995     25.0   4.93e- 78  23.0       26.9  
## 16 flipper_leng… sexm…   4.62      0.936      4.93  1.30e-  6   2.77       6.46 
## 17 flipper_leng… spec…   3.56      1.66       2.14  3.28e-  2   0.293      6.83 
## 18 flipper_leng… spec…   4.22      1.40       3.02  2.74e-  3   1.47       6.97

9 要約いろいろ

9.1 psych::describe

  • 数値の要約には便利
psych::describe(penguins)
##                   vars   n    mean     sd  median trimmed    mad    min    max
## species*             1 344    1.92   0.89    2.00    1.90   1.48    1.0    3.0
## island*              2 344    1.66   0.73    2.00    1.58   1.48    1.0    3.0
## bill_length_mm       3 342   43.92   5.46   44.45   43.91   7.04   32.1   59.6
## bill_depth_mm        4 342   17.15   1.97   17.30   17.17   2.22   13.1   21.5
## flipper_length_mm    5 342  200.92  14.06  197.00  200.34  16.31  172.0  231.0
## body_mass_g          6 342 4201.75 801.95 4050.00 4154.01 889.56 2700.0 6300.0
## sex*                 7 333    1.50   0.50    2.00    1.51   0.00    1.0    2.0
## year                 8 344 2008.03   0.82 2008.00 2008.04   1.48 2007.0 2009.0
##                    range  skew kurtosis    se
## species*             2.0  0.16    -1.73  0.05
## island*              2.0  0.61    -0.91  0.04
## bill_length_mm      27.5  0.05    -0.89  0.30
## bill_depth_mm        8.4 -0.14    -0.92  0.11
## flipper_length_mm   59.0  0.34    -1.00  0.76
## body_mass_g       3600.0  0.47    -0.74 43.36
## sex*                 1.0 -0.02    -2.01  0.03
## year                 2.0 -0.05    -1.51  0.04

9.2 skimr

9.2.1 ヒストグラムなし

library(skimr)

skim_without_charts(penguins) |> 
  as_tibble()
## # A tibble: 8 × 14
##   skim_type skim_variable n_missing complete_rate factor.ordered factor.n_unique
##   <chr>     <chr>             <int>         <dbl> <lgl>                    <int>
## 1 factor    species               0         1     FALSE                        3
## 2 factor    island                0         1     FALSE                        3
## 3 factor    sex                  11         0.968 FALSE                        2
## 4 numeric   bill_length_…         2         0.994 NA                          NA
## 5 numeric   bill_depth_mm         2         0.994 NA                          NA
## 6 numeric   flipper_leng…         2         0.994 NA                          NA
## 7 numeric   body_mass_g           2         0.994 NA                          NA
## 8 numeric   year                  0         1     NA                          NA
## # ℹ 8 more variables: factor.top_counts <chr>, numeric.mean <dbl>,
## #   numeric.sd <dbl>, numeric.p0 <dbl>, numeric.p25 <dbl>, numeric.p50 <dbl>,
## #   numeric.p75 <dbl>, numeric.p100 <dbl>

9.2.2 型を限定

skim(penguins) |> 
yank("numeric") |> 
  as_tibble()
## # A tibble: 5 × 11
##   skim_variable     n_missing complete_rate   mean      sd     p0    p25    p50
##   <chr>                 <int>         <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 bill_length_mm            2         0.994   43.9   5.46    32.1   39.2   44.4
## 2 bill_depth_mm             2         0.994   17.2   1.97    13.1   15.6   17.3
## 3 flipper_length_mm         2         0.994  201.   14.1    172    190    197  
## 4 body_mass_g               2         0.994 4202.  802.    2700   3550   4050  
## 5 year                      0         1     2008.    0.818 2007   2007   2008  
## # ℹ 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>

9.2.3 nを表示

skimn <- skim_with(base = sfl(n = \(x) sum(!is.na(x))))

skimn(penguins)
Data summary
Name penguins
Number of rows 344
Number of columns 8
_______________________
Column type frequency:
factor 3
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n ordered n_unique top_counts
species 344 FALSE 3 Ade: 152, Gen: 124, Chi: 68
island 344 FALSE 3 Bis: 168, Dre: 124, Tor: 52
sex 333 FALSE 2 mal: 168, fem: 165

Variable type: numeric

skim_variable n mean sd p0 p25 p50 p75 p100 hist
bill_length_mm 342 43.92 5.46 32.1 39.23 44.45 48.5 59.6 ▃▇▇▆▁
bill_depth_mm 342 17.15 1.97 13.1 15.60 17.30 18.7 21.5 ▅▅▇▇▂
flipper_length_mm 342 200.92 14.06 172.0 190.00 197.00 213.0 231.0 ▂▇▃▅▂
body_mass_g 342 4201.75 801.95 2700.0 3550.00 4050.00 4750.0 6300.0 ▃▇▆▃▂
year 344 2008.03 0.82 2007.0 2007.00 2008.00 2009.0 2009.0 ▇▁▇▁▇

9.2.4 層別

penguins |> 
  group_by(sex) |> 
  skim() |> 
  as_tibble()
## # A tibble: 21 × 16
##    skim_type skim_variable  sex    n_missing complete_rate factor.ordered
##    <chr>     <chr>          <fct>      <int>         <dbl> <lgl>         
##  1 factor    species        female         0         1     FALSE         
##  2 factor    species        male           0         1     FALSE         
##  3 factor    species        <NA>           0         1     FALSE         
##  4 factor    island         female         0         1     FALSE         
##  5 factor    island         male           0         1     FALSE         
##  6 factor    island         <NA>           0         1     FALSE         
##  7 numeric   bill_length_mm female         0         1     NA            
##  8 numeric   bill_length_mm male           0         1     NA            
##  9 numeric   bill_length_mm <NA>           2         0.818 NA            
## 10 numeric   bill_depth_mm  female         0         1     NA            
## # ℹ 11 more rows
## # ℹ 10 more variables: factor.n_unique <int>, factor.top_counts <chr>,
## #   numeric.mean <dbl>, numeric.sd <dbl>, numeric.p0 <dbl>, numeric.p25 <dbl>,
## #   numeric.p50 <dbl>, numeric.p75 <dbl>, numeric.p100 <dbl>,
## #   numeric.hist <chr>

9.2.4.1 層別に特定の変数の平均とSD

penguins |> 
  group_by(species, sex) |> 
  skim(bill_length_mm, bill_depth_mm) |> # selectを先にもってきてもよい
  as_tibble()
## # A tibble: 16 × 14
##    skim_type skim_variable  species   sex   n_missing complete_rate numeric.mean
##    <chr>     <chr>          <fct>     <fct>     <int>         <dbl>        <dbl>
##  1 numeric   bill_length_mm Adelie    fema…         0         1             37.3
##  2 numeric   bill_length_mm Adelie    male          0         1             40.4
##  3 numeric   bill_length_mm Adelie    <NA>          1         0.833         37.8
##  4 numeric   bill_length_mm Chinstrap fema…         0         1             46.6
##  5 numeric   bill_length_mm Chinstrap male          0         1             51.1
##  6 numeric   bill_length_mm Gentoo    fema…         0         1             45.6
##  7 numeric   bill_length_mm Gentoo    male          0         1             49.5
##  8 numeric   bill_length_mm Gentoo    <NA>          1         0.8           45.6
##  9 numeric   bill_depth_mm  Adelie    fema…         0         1             17.6
## 10 numeric   bill_depth_mm  Adelie    male          0         1             19.1
## 11 numeric   bill_depth_mm  Adelie    <NA>          1         0.833         18.3
## 12 numeric   bill_depth_mm  Chinstrap fema…         0         1             17.6
## 13 numeric   bill_depth_mm  Chinstrap male          0         1             19.3
## 14 numeric   bill_depth_mm  Gentoo    fema…         0         1             14.2
## 15 numeric   bill_depth_mm  Gentoo    male          0         1             15.7
## 16 numeric   bill_depth_mm  Gentoo    <NA>          1         0.8           14.6
## # ℹ 7 more variables: numeric.sd <dbl>, numeric.p0 <dbl>, numeric.p25 <dbl>,
## #   numeric.p50 <dbl>, numeric.p75 <dbl>, numeric.p100 <dbl>,
## #   numeric.hist <chr>

9.2.4.2 層別にnを表示

my_skim <- skim_with(base = sfl(n = length))

penguins |> 
  group_by(species, sex) |> 
  my_skim(bill_length_mm) |>
  as_tibble()
## # A tibble: 8 × 13
##   skim_type skim_variable species sex       n numeric.mean numeric.sd numeric.p0
##   <chr>     <chr>         <fct>   <fct> <int>        <dbl>      <dbl>      <dbl>
## 1 numeric   bill_length_… Adelie  fema…    73         37.3       2.03       32.1
## 2 numeric   bill_length_… Adelie  male     73         40.4       2.28       34.6
## 3 numeric   bill_length_… Adelie  <NA>      6         37.8       2.80       34.1
## 4 numeric   bill_length_… Chinst… fema…    34         46.6       3.11       40.9
## 5 numeric   bill_length_… Chinst… male     34         51.1       1.56       48.5
## 6 numeric   bill_length_… Gentoo  fema…    58         45.6       2.05       40.9
## 7 numeric   bill_length_… Gentoo  male     61         49.5       2.72       44.4
## 8 numeric   bill_length_… Gentoo  <NA>      5         45.6       1.37       44.5
## # ℹ 5 more variables: numeric.p25 <dbl>, numeric.p50 <dbl>, numeric.p75 <dbl>,
## #   numeric.p100 <dbl>, numeric.hist <chr>

9.2.5 longに

to_long(penguins)
## # A tibble: 65 × 4
##    skim_type skim_variable     stat          formatted
##    <chr>     <chr>             <chr>         <chr>    
##  1 factor    species           n_missing     0        
##  2 factor    island            n_missing     0        
##  3 factor    sex               n_missing     11       
##  4 numeric   bill_length_mm    n_missing     2        
##  5 numeric   bill_depth_mm     n_missing     2        
##  6 numeric   flipper_length_mm n_missing     2        
##  7 numeric   body_mass_g       n_missing     2        
##  8 numeric   year              n_missing     0        
##  9 factor    species           complete_rate 1        
## 10 factor    island            complete_rate 1        
## # ℹ 55 more rows

9.3 Data Explorer

https://rpubs.com/mark_sch7/DataExplorerPackage

# devtools::install_github("boxuancui/DataExplorer")
library(DataExplorer)

9.3.1 グラフ

9.3.1.1 頻度の棒グラフ

plot_bar(penguins)

9.3.1.2 ヒストグラム

plot_histogram(penguins)

10 数字

10.1 eの表記を数値に

bnum <- 10000000000

bnum
## [1] 1e+10
format(bnum, scientific = FALSE)
## [1] "10000000000"

10.1.1 設定自体を変更

options(scipen=999)

bnum
## [1] 10000000000
# 戻す
options(scipen = 0)

bnum
## [1] 1e+10

10.2 小数点

10.2.1 四捨五入のルール

http://cse.naro.affrc.go.jp/takezawa/r-tips/r/37.html

round(1.5)
## [1] 2
round(2.5)
## [1] 2

10.2.2 切り捨てと切り上げ

round(0.123456, 5) # 普通の丸め
## [1] 0.12346
round(0.123454, 5) # 普通の丸め
## [1] 0.12345
# library(plyr) # dplyr読んだ後に読み込むとdplyrの関数で動かなくなるのがあるので読むならtidyverseの先に
plyr::round_any(0.123456, 0.00001, floor)   # 切り捨て
## [1] 0.12345
plyr::round_any(0.123454, 0.00001, ceiling) # 切り上げ
## [1] 0.12346

10.2.2.1 切り捨てをtruncで工夫(整数にして割り戻す)

trunc(0.123456*10^5)/10^5
## [1] 0.12345

10.2.2.2 mutateのなかで使う

# 小数第2位まで出ている変数
mtcars |> 
  select(drat, wt)
##                     drat    wt
## Mazda RX4           3.90 2.620
## Mazda RX4 Wag       3.90 2.875
## Datsun 710          3.85 2.320
## Hornet 4 Drive      3.08 3.215
## Hornet Sportabout   3.15 3.440
## Valiant             2.76 3.460
## Duster 360          3.21 3.570
## Merc 240D           3.69 3.190
## Merc 230            3.92 3.150
## Merc 280            3.92 3.440
## Merc 280C           3.92 3.440
## Merc 450SE          3.07 4.070
## Merc 450SL          3.07 3.730
## Merc 450SLC         3.07 3.780
## Cadillac Fleetwood  2.93 5.250
## Lincoln Continental 3.00 5.424
## Chrysler Imperial   3.23 5.345
## Fiat 128            4.08 2.200
## Honda Civic         4.93 1.615
## Toyota Corolla      4.22 1.835
## Toyota Corona       3.70 2.465
## Dodge Challenger    2.76 3.520
## AMC Javelin         3.15 3.435
## Camaro Z28          3.73 3.840
## Pontiac Firebird    3.08 3.845
## Fiat X1-9           4.08 1.935
## Porsche 914-2       4.43 2.140
## Lotus Europa        3.77 1.513
## Ford Pantera L      4.22 3.170
## Ferrari Dino        3.62 2.770
## Maserati Bora       3.54 3.570
## Volvo 142E          4.11 2.780
# 変数上書き
10.2.2.2.1 変数上書き
mtcars |> 
  mutate(across(c(drat,wt),
                \(x) plyr::round_any(x, 0.1, floor))) |> 
  select(drat, wt) |> 
  head()
##                   drat  wt
## Mazda RX4          3.9 2.6
## Mazda RX4 Wag      3.9 2.8
## Datsun 710         3.8 2.3
## Hornet 4 Drive     3.0 3.2
## Hornet Sportabout  3.1 3.4
## Valiant            2.7 3.4
10.2.2.2.2 変数追加
mtcars |> 
  select(drat, wt) |>
  mutate(across(c(drat,wt),
                list(fl = \(x) plyr::round_any(x, 0.1, floor)))) |> 
    head()
##                   drat    wt drat_fl wt_fl
## Mazda RX4         3.90 2.620     3.9   2.6
## Mazda RX4 Wag     3.90 2.875     3.9   2.8
## Datsun 710        3.85 2.320     3.8   2.3
## Hornet 4 Drive    3.08 3.215     3.0   3.2
## Hornet Sportabout 3.15 3.440     3.1   3.4
## Valiant           2.76 3.460     2.7   3.4
10.2.2.2.3 trunc
mtcars |> 
  mutate(across(c(drat,wt),
                \(x) trunc(x*10)/10)) |> 
  select(drat, wt) |> 
  head()
##                   drat  wt
## Mazda RX4          3.9 2.6
## Mazda RX4 Wag      3.9 2.8
## Datsun 710         3.8 2.3
## Hornet 4 Drive     3.0 3.2
## Hornet Sportabout  3.1 3.4
## Valiant            2.7 3.4

11 文字

11.1 str_replace

  • 置換する文字対の一覧を名前付きベクトルで作成
moji_vec <- 
 as.character(1:3) |>                 # ベクトルの値: 変更先の文字
  set_names(c("one", "two", "three")) # 名前: 元の文字

moji_vec
##   one   two three 
##   "1"   "2"   "3"
x <- c("item_one", "item_two", "item_three", "item_two_three")
x
## [1] "item_one"       "item_two"       "item_three"     "item_two_three"
str_replace_all(x, 
            moji_vec)
## [1] "item_1"   "item_2"   "item_3"   "item_2_3"

11.2 str_squish テキスト中の複数空白を1つに

  • 文字列の中に複数空白があっても1つだけにする
str_squish("あ い  う    え       お")
## [1] "あ い う え お"

11.3 str_replace 特定の文字列以降を削除

word <- "このテキストより後を削除"

str_replace("文字列 このテキストより後を削除  削除部分",
           str_c(word, ".*"), word)
## [1] "文字列 このテキストより後を削除"

11.4 変数名, 列名, column

11.4.1 出力いろいろ

11.4.1.1 そのまま文字ベクトルとして

vars <- 
  penguins |>
  names()

vars
## [1] "species"           "island"            "bill_length_mm"   
## [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
## [7] "sex"               "year"

11.4.1.2 文字ベクトルの入力時の形式として

dput(vars)
## c("species", "island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", 
## "body_mass_g", "sex", "year")

constructiveパッケージ

constructive::construct(vars)
## c(
##   "species", "island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm",
##   "body_mass_g", "sex", "year"
## )

11.4.1.3 1つの文字列として

cat(vars)
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year

11.4.1.4 ” “で囲まないベクトルとして

noquote(vars)
## [1] species           island            bill_length_mm    bill_depth_mm    
## [5] flipper_length_mm body_mass_g       sex               year

https://cynkra.github.io/constructive/

12 list

12.1 長さの違う要素を持つlistを2列のデータフレームにまとめる

test_l <- 
  list(a = c(1,2),
       b = c(1,2,3),
       c = c(1,2,3,4))

stack(test_l)
##   values ind
## 1      1   a
## 2      2   a
## 3      1   b
## 4      2   b
## 5      3   b
## 6      1   c
## 7      2   c
## 8      3   c
## 9      4   c

13 データフレーム

13.1 差分の確認

  • joinした後に.xと.yの接尾辞がついた変数の差分作成

13.1.1 joinする2つのデータフレーム作成

p1 <-
  penguins |>
    select(species, starts_with("bill")) |> 
    slice(1:3) |> 
    mutate(id = row_number())
  
p2 <-
  penguins |>
    select(species, starts_with("bill")) |> 
    slice(4:6) |> 
    mutate(id = row_number())
p1_2 <-
  p1 |>
    left_join(p2, by = join_by(id, species))

p1_2
## # A tibble: 3 × 6
##   species bill_length_mm.x bill_depth_mm.x    id bill_length_mm.y
##   <fct>              <dbl>           <dbl> <int>            <dbl>
## 1 Adelie              39.1            18.7     1             NA  
## 2 Adelie              39.5            17.4     2             36.7
## 3 Adelie              40.3            18       3             39.3
## # ℹ 1 more variable: bill_depth_mm.y <dbl>

13.1.2 関数を作成して実行

# 対象となる変数名のベクトル作成
var_name <- 
  penguins |>
    select(starts_with("bill")) |> 
    names()
  
  
# 差分を計算する関数作成

make_dif <- function(df, var){
  df |>
    mutate("dif_,{var}" :=
             !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y"))
    )
}

make_dif2 <- function(df, var){
  df |>
    mutate("dif_{var}" := 
             !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y"))
    )
}

# 動く
# make_dif <- function(df, var){
#     df |>
#       mutate(!!sym(str_c("dif_",{{var}})) :=
#                     !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y")))
#   }
# 
# make_dif2 <- function(df, var){
#     df |>
#       mutate("dif_{{var}}" := !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y")))
#   }

# 動く
# make_dif <- function(df, var){
#     df |>
#       mutate(!!sym(str_c("dif_",{{var}})) :=
#                     !!sym(str_c({{var}}, ".x")) - !!sym(str_c({{var}}, ".y")))
#   }
# 
# make_dif2 <- function(df, var){
#     df |>
#       mutate("dif_{{var}}" := !!sym(str_c({{var}}, ".x")) - !!sym(str_c({{var}}, ".y")))
#   }

# 動く
# make_di2 <- function(df, var){
# var1 <- sym(str_c({{var}}, ".x"))
# var2 <- sym(str_c({{var}}, ".y"))
#       df |>
#       mutate("dif_{{var}}" := !!var1 - !!var2)
#   }

# 動く
# make_di2 <- function(df, var){
# var1 <- sym("{{var}}.x")
# var2 <- sym("{{var}}.y")
#       df |>
#       mutate("dif_{{var}}" := !!var1 - !!var2)
#   }


# 動かない
# make_dif3 <- function(df, var){
# df |> 
#   mutate("dif_{{var}}" := !!sym("{{var}}.x") - !!sym("{{var}}.y"))
# }

# 動かない
# make_dif3 <- function(df, var){
#   form <- "{{var}}.x - {{var}}.y"
# df |>
#   mutate("dif_{{var}}" := !! rlang::parse_expr(form) )
# }


  
p1_2 |>
  reduce(var_name, make_dif, .init = _) |> 
  select(id, species, contains("bill_length"), contains("bill_depth")) |> 
    glimpse()
## Rows: 3
## Columns: 8
## $ id                    <int> 1, 2, 3
## $ species               <fct> Adelie, Adelie, Adelie
## $ bill_length_mm.x      <dbl> 39.1, 39.5, 40.3
## $ bill_length_mm.y      <dbl> NA, 36.7, 39.3
## $ `dif_,bill_length_mm` <dbl> NA, 2.8, 1.0
## $ bill_depth_mm.x       <dbl> 18.7, 17.4, 18.0
## $ bill_depth_mm.y       <dbl> NA, 19.3, 20.6
## $ `dif_,bill_depth_mm`  <dbl> NA, -1.9, -2.6
p1_2 |>
  reduce(var_name, make_dif2, .init = _) |> 
  select(id, species, contains("bill_length"), contains("bill_depth")) |> 
    glimpse()
## Rows: 3
## Columns: 8
## $ id                 <int> 1, 2, 3
## $ species            <fct> Adelie, Adelie, Adelie
## $ bill_length_mm.x   <dbl> 39.1, 39.5, 40.3
## $ bill_length_mm.y   <dbl> NA, 36.7, 39.3
## $ dif_bill_length_mm <dbl> NA, 2.8, 1.0
## $ bill_depth_mm.x    <dbl> 18.7, 17.4, 18.0
## $ bill_depth_mm.y    <dbl> NA, 19.3, 20.6
## $ dif_bill_depth_mm  <dbl> NA, -1.9, -2.6
# p1_2 |>
#   reduce(var_name, make_dif3, .init = .) |>
#   select(id, species, contains("bill_length"), contains("bill_depth")) |>
#     glimpse()

13.1.3 longにしてから差分作成

  • r-wakalangで教えてもらった
  • サンプルサイズが大きすぎなければこちらの方がシンプル
p1_long <- 
  pivot_longer(p1, !c(species, id), 
               values_to = "old")

p2_long <- 
  pivot_longer(p2, !c(species, id), 
               values_to = "new")

p1_2_long <-
  p1_long |>
    left_join(p2_long, by = c("id", "species", "name"))
  
p1_2_long |>
   mutate(diff = old - new) |>
    pivot_wider(
      names_from = name,
      values_from = old:diff,
      names_glue = "{name}_{.value}"
    )  |> 
  select(id, species, contains("bill_length"), contains("bill_depth")) |>   
    glimpse()
## Rows: 3
## Columns: 8
## $ id                  <int> 1, 2, 3
## $ species             <fct> Adelie, Adelie, Adelie
## $ bill_length_mm_old  <dbl> 39.1, 39.5, 40.3
## $ bill_length_mm_new  <dbl> NA, 36.7, 39.3
## $ bill_length_mm_diff <dbl> NA, 2.8, 1.0
## $ bill_depth_mm_old   <dbl> 18.7, 17.4, 18.0
## $ bill_depth_mm_new   <dbl> NA, 19.3, 20.6
## $ bill_depth_mm_diff  <dbl> NA, -1.9, -2.6

13.2 再現可能なデータ例の作成

constructive::construct(penguins |> head(2))
## tibble::tibble(
##   species = factor(c("Adelie", "Adelie"), levels = c("Adelie", "Chinstrap", "Gentoo")),
##   island = factor(c("Torgersen", "Torgersen"), levels = c("Biscoe", "Dream", "Torgersen")),
##   bill_length_mm = c(39.1, 39.5),
##   bill_depth_mm = c(18.7, 17.4),
##   flipper_length_mm = c(181L, 186L),
##   body_mass_g = c(3750L, 3800L),
##   sex = factor(c("male", "female")),
##   year = c(2007L, 2007L),
## )

14 データのサイズ

14.1 Rで使用しているメモリ量

pryr::mem_used()
## 129 MB

14.2 オブジェクトのサイズ

test <- c(1:10000)
pryr::object_size(test)
## 40.05 kB

15 RCTの解析

  • パッケージHSAUR3をインストール

15.1 データ

btheb <- 
  HSAUR3::BtheB |> 
  clean_names() |>
  as_tibble()

# id付ける
btheb <- 
  btheb |> 
  rowid_to_column(var = "id") 

btheb
## # A tibble: 100 × 9
##       id drug  length treatment bdi_pre bdi_2m bdi_3m bdi_5m bdi_8m
##    <int> <fct> <fct>  <fct>       <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1     1 No    >6m    TAU            29      2      2     NA     NA
##  2     2 Yes   >6m    BtheB          32     16     24     17     20
##  3     3 Yes   <6m    TAU            25     20     NA     NA     NA
##  4     4 No    >6m    BtheB          21     17     16     10      9
##  5     5 Yes   >6m    BtheB          26     23     NA     NA     NA
##  6     6 Yes   <6m    BtheB           7      0      0      0      0
##  7     7 Yes   <6m    TAU            17      7      7      3      7
##  8     8 No    >6m    TAU            20     20     21     19     13
##  9     9 Yes   <6m    BtheB          18     13     14     20     11
## 10    10 Yes   >6m    BtheB          20      5      5      8     12
## # ℹ 90 more rows

15.2 wideからlongに

btheb_long <- 
  btheb |> 
  pivot_longer(!c(id:treatment),
               names_to = "time",
               names_prefix = "bdi_", # timeの値から"bdi_"を削除
               values_to = "bdi")

# timeをfactorにして順序指定
btheb_long <- 
  btheb_long |> 
  mutate(time = fct_relevel(time, "pre", "2m", "3m", "5m"))



btheb_long
## # A tibble: 500 × 6
##       id drug  length treatment time    bdi
##    <int> <fct> <fct>  <fct>     <fct> <dbl>
##  1     1 No    >6m    TAU       pre      29
##  2     1 No    >6m    TAU       2m        2
##  3     1 No    >6m    TAU       3m        2
##  4     1 No    >6m    TAU       5m       NA
##  5     1 No    >6m    TAU       8m       NA
##  6     2 Yes   >6m    BtheB     pre      32
##  7     2 Yes   >6m    BtheB     2m       16
##  8     2 Yes   >6m    BtheB     3m       24
##  9     2 Yes   >6m    BtheB     5m       17
## 10     2 Yes   >6m    BtheB     8m       20
## # ℹ 490 more rows

15.3 mixed model

15.3.1 mixed modelの解説

15.3.2 モデル作成と実行

mm <- 
  lme4::lmer(bdi ~ treatment*time + (1 | id), 
             data = btheb_long)

summary(mm)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bdi ~ treatment * time + (1 | id)
##    Data: btheb_long
## 
## REML criterion at convergence: 2632.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1787 -0.5372 -0.1079  0.4510  3.3967 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 83.59    9.143   
##  Residual             36.44    6.037   
## Number of obs: 380, groups:  id, 100
## 
## Fixed effects:
##                       Estimate Std. Error t value
## (Intercept)             24.188      1.581  15.295
## treatmentBtheB          -1.649      2.193  -0.752
## time2m                  -4.497      1.267  -3.551
## time3m                  -6.115      1.375  -4.448
## time5m                  -7.585      1.485  -5.108
## time8m                 -10.454      1.566  -6.676
## treatmentBtheB:time2m   -3.330      1.734  -1.920
## treatmentBtheB:time3m   -3.573      1.922  -1.859
## treatmentBtheB:time5m   -3.284      2.088  -1.573
## treatmentBtheB:time8m   -1.181      2.172  -0.543
## 
## Correlation of Fixed Effects:
##             (Intr) trtmBB time2m time3m time5m time8m trBB:2 trBB:3 trBB:5
## tretmntBthB -0.721                                                        
## time2m      -0.379  0.273                                                 
## time3m      -0.349  0.252  0.456                                          
## time5m      -0.323  0.233  0.422  0.430                                   
## time8m      -0.307  0.221  0.401  0.408  0.408                            
## trtmntBtB:2  0.277 -0.384 -0.731 -0.333 -0.309 -0.293                     
## trtmntBtB:3  0.250 -0.346 -0.326 -0.715 -0.308 -0.292  0.449              
## trtmntBtB:5  0.230 -0.319 -0.300 -0.306 -0.711 -0.290  0.413  0.423       
## trtmntBtB:8  0.221 -0.306 -0.289 -0.294 -0.294 -0.721  0.397  0.406  0.407

15.3.3 tidyな出力

broom.mixed::tidy(mm, conf.int = TRUE)
## # A tibble: 12 × 8
##    effect   group    term        estimate std.error statistic conf.low conf.high
##    <chr>    <chr>    <chr>          <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 fixed    <NA>     (Intercept)    24.2       1.58    15.3      21.1    27.3   
##  2 fixed    <NA>     treatmentB…    -1.65      2.19    -0.752    -5.95    2.65  
##  3 fixed    <NA>     time2m         -4.50      1.27    -3.55     -6.98   -2.01  
##  4 fixed    <NA>     time3m         -6.11      1.37    -4.45     -8.81   -3.42  
##  5 fixed    <NA>     time5m         -7.58      1.49    -5.11    -10.5    -4.67  
##  6 fixed    <NA>     time8m        -10.5       1.57    -6.68    -13.5    -7.38  
##  7 fixed    <NA>     treatmentB…    -3.33      1.73    -1.92     -6.73    0.0686
##  8 fixed    <NA>     treatmentB…    -3.57      1.92    -1.86     -7.34    0.193 
##  9 fixed    <NA>     treatmentB…    -3.28      2.09    -1.57     -7.38    0.808 
## 10 fixed    <NA>     treatmentB…    -1.18      2.17    -0.543    -5.44    3.08  
## 11 ran_pars id       sd__(Inter…     9.14     NA       NA        NA      NA     
## 12 ran_pars Residual sd__Observ…     6.04     NA       NA        NA      NA

15.3.4 icc

lme4::VarCorr(mm) |>
  as_tibble() |>
  mutate(icc = vcov/sum(vcov))
## # A tibble: 2 × 6
##   grp      var1        var2   vcov sdcor   icc
##   <chr>    <chr>       <chr> <dbl> <dbl> <dbl>
## 1 id       (Intercept) <NA>   83.6  9.14 0.696
## 2 Residual <NA>        <NA>   36.4  6.04 0.304

15.3.5 グラフ

btheb_long |> 
    mutate(id = factor(id)) |> 
    filter(id %in% c(1:20)) |> # IDが1~20までに限定
ggplot(aes(time, bdi, 
           group = id)) + # color = id
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(size = 1) +
  theme_minimal() +
  facet_wrap(vars(treatment))

15.3.6 対比 contrast

emmeans::emmeans(mm, specs = trt.vs.ctrl ~ time:treatment, adjust = "none")
## $emmeans
##  time treatment emmean   SE  df lower.CL upper.CL
##  pre  TAU         24.2 1.58 151    21.06     27.3
##  2m   TAU         19.7 1.61 158    16.51     22.9
##  3m   TAU         18.1 1.69 186    14.73     21.4
##  5m   TAU         16.6 1.79 216    13.08     20.1
##  8m   TAU         13.7 1.85 239    10.08     17.4
##  pre  BtheB       22.5 1.52 151    19.54     25.5
##  2m   BtheB       14.7 1.52 151    11.71     17.7
##  3m   BtheB       12.9 1.65 193     9.60     16.1
##  5m   BtheB       11.7 1.75 229     8.22     15.1
##  8m   BtheB       10.9 1.78 240     7.39     14.4
## 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast            estimate   SE  df t.ratio p.value
##  2m TAU - pre TAU       -4.50 1.27 275  -3.551  0.0005
##  3m TAU - pre TAU       -6.11 1.37 279  -4.448  <.0001
##  5m TAU - pre TAU       -7.58 1.49 280  -5.108  <.0001
##  8m TAU - pre TAU      -10.45 1.57 280  -6.676  <.0001
##  pre BtheB - pre TAU    -1.65 2.19 151  -0.752  0.4532
##  2m BtheB - pre TAU     -9.48 2.19 151  -4.321  <.0001
##  3m BtheB - pre TAU    -11.34 2.28 171  -4.967  <.0001
##  5m BtheB - pre TAU    -12.52 2.36 190  -5.309  <.0001
##  8m BtheB - pre TAU    -13.28 2.38 196  -5.576  <.0001
## 
## Degrees-of-freedom method: satterthwaite
# 邪道だけど,絶対どうしてもp値を求められている場合
# lmerTest::lmer(bdi ~ treatment*time + (1 | id), 
#               data = btheb_long) |>
#   anova()

15.4 cohen’s d

  • EMAtools::lme.dscore()
    • lmerTestの結果のtとdfから算出

\[ d = \frac{2t}{\sqrt{df}} \]

15.4.1 手動で確認

mmt <- 
  lmerTest::lmer(bdi ~ treatment*time + (1 | id), 
             data = btheb_long)

# 交互作用項だけに
summary(mmt)$coefficients |> 
  as_tibble(rownames = "terms") |> 
  filter(terms == "treatmentBtheB:time8m")
## # A tibble: 1 × 6
##   terms                 Estimate `Std. Error`    df `t value` `Pr(>|t|)`
##   <chr>                    <dbl>        <dbl> <dbl>     <dbl>      <dbl>
## 1 treatmentBtheB:time8m    -1.18         2.17  280.    -0.543      0.587
  • (2*-0.543)/sqrt(280) = -0.0649009

15.4.2 参考:通常の群間のCohen’s d

15.4.2.1 平均値

btheb |> 
  group_by(treatment) |> 
  summarise(mean = mean(bdi_8m, na.rm = TRUE),
            sd = sd(bdi_8m, na.rm = TRUE),
            n = sum(!is.na(bdi_8m)))
## # A tibble: 2 × 4
##   treatment  mean    sd     n
##   <fct>     <dbl> <dbl> <int>
## 1 TAU       13.6  11.5     25
## 2 BtheB      8.85  6.09    27

15.4.2.2 Cohen’s d

15.4.2.2.1 compute.es
compute.es::mes(13.6, 8.85, 11.5, 6.09, 25, 27)
## Mean Differences ES: 
##  
##  d [ 95 %CI] = 0.52 [ -0.03 , 1.08 ] 
##   var(d) = 0.08 
##   p-value(d) = 0.07 
##   U3(d) = 69.92 % 
##   CLES(d) = 64.4 % 
##   Cliff's Delta = 0.29 
##  
##  g [ 95 %CI] = 0.51 [ -0.03 , 1.06 ] 
##   var(g) = 0.08 
##   p-value(g) = 0.07 
##   U3(g) = 69.65 % 
##   CLES(g) = 64.19 % 
##  
##  Correlation ES: 
##  
##  r [ 95 %CI] = 0.26 [ -0.02 , 0.5 ] 
##   var(r) = 0.02 
##   p-value(r) = 0.07 
##  
##  z [ 95 %CI] = 0.26 [ -0.02 , 0.54 ] 
##   var(z) = 0.02 
##   p-value(z) = 0.07 
##  
##  Odds Ratio ES: 
##  
##  OR [ 95 %CI] = 2.58 [ 0.95 , 7.03 ] 
##   p-value(OR) = 0.07 
##  
##  Log OR [ 95 %CI] = 0.95 [ -0.06 , 1.95 ] 
##   var(lOR) = 0.26 
##   p-value(Log OR) = 0.07 
##  
##  Other: 
##  
##  NNT = 5.73 
##  Total N = 52
15.4.2.2.2 effectsize
effectsize::cohens_d(bdi_8m ~ treatment, data = btheb)
## Cohen's d |        95% CI
## -------------------------
## 0.52      | [-0.03, 1.07]
## 
## - Estimated using pooled SD.

15.5 Jacobson-Truax and reliable change indices

# data.frame型にする必要あり
btheb <- 
  btheb |> 
  as.data.frame()

# インストール
# devtools::install_github("AWKruijt/JT-RCI")

# jacobson & truaxのreliable and clinically significant change
JTRCI::JTRCI(data = btheb, 
      pre = "bdi_pre", post = "bdi_8m",
      group = "treatment",
      reliability = .75,
      JTcrit = "C",
      normM = 13.7, normSD = 12.2 # 健康な集団の平均値とSD,
      # dysfM = , dysfSD = 
        )
##    Jacobson-Truax classification    TAU  BtheB
##                           <char> <char> <char>
## 1:                  deteriorated      0      0
## 2:                     unchanged      7      2
## 3:                      improved      1      0
## 4:        non reliably recovered      8     15
## 5:                     recovered      9     10

15.5.1 個別の結果を表示

  • preとpostで欠損がリストワイズされてることに注意
  • RCI > 1.96 がよい
  • RCI = change_abs/Sdiff
JTRCIdf |> head()
##   ppid group pre post change_abs SEmeasurement    Sdiff    crittype critval
## 1    1 BtheB  32   20        -12      4.804502 6.794592 criterion C 18.9132
## 2    2 BtheB  21    9        -12      4.804502 6.794592 criterion C 18.9132
## 3    3 BtheB   7    0         -7      4.804502 6.794592 criterion C 18.9132
## 4    4   TAU  17    7        -10      4.804502 6.794592 criterion C 18.9132
## 5    5   TAU  20   13         -7      4.804502 6.794592 criterion C 18.9132
## 6    6 BtheB  18   11         -7      4.804502 6.794592 criterion C 18.9132
##         RCI            class_JTRCI
## 1 -1.766111              unchanged
## 2 -1.766111 non reliably recovered
## 3 -1.030231 non reliably recovered
## 4 -1.471759 non reliably recovered
## 5 -1.030231 non reliably recovered
## 6 -1.030231 non reliably recovered

15.5.1.1 別パッケージでRCI算出

  • suddengainsパッケージ
# メタ分析による再検査信頼性
test_retest_bdi <- 0.75

# 8mの欠損を除外したtime1でのBDIのSD
bdi_pre_sd <- 
  btheb |> 
  drop_na(bdi_8m) |> 
  summarise(sd(bdi_pre, na.rm = TRUE)) |> 
  pull()

# RCI算出
suddengains::define_crit1_cutoff(sd = bdi_pre_sd,
                    reliability = test_retest_bdi)
## $sd
## [1] 9.609004
## 
## $reliability
## [1] 0.75
## 
## $standard_error_measurement
## [1] 4.804502
## 
## $standard_error_difference
## [1] 6.794592
## 
## $reliable_change_value
## [1] 13.3174

15.5.1.2 RCI手計算

# pre SD
bdi_pre_sd
## [1] 9.609004
# standard error
se_index <- 
bdi_pre_sd*sqrt(1-test_retest_bdi)
se_index
## [1] 4.804502
# standar error difference
se_d <- 
  sqrt(2*se_index^2)
se_d
## [1] 6.794592
# pre - postが次の値を超えるとreliableな値

1.96*se_d
## [1] 13.3174

16 csv読み込み

16.1 基本

library(vroom)
vroom("data/penguins.csv")
## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>              <dbl>         <dbl>             <dbl>       <dbl>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <chr>, year <dbl>

16.2 読み込む変数選択

16.2.1 列を特定

vroom("data/penguins.csv",
      col_select = c(species, sex))
## # A tibble: 344 × 2
##    species sex   
##    <chr>   <chr> 
##  1 Adelie  male  
##  2 Adelie  female
##  3 Adelie  female
##  4 Adelie  <NA>  
##  5 Adelie  female
##  6 Adelie  male  
##  7 Adelie  female
##  8 Adelie  male  
##  9 Adelie  <NA>  
## 10 Adelie  <NA>  
## # ℹ 334 more rows

16.2.2 ヘルパー関数

vroom("data/penguins.csv",
      col_select = starts_with("bill"))
## # A tibble: 344 × 2
##    bill_length_mm bill_depth_mm
##             <dbl>         <dbl>
##  1           39.1          18.7
##  2           39.5          17.4
##  3           40.3          18  
##  4           NA            NA  
##  5           36.7          19.3
##  6           39.3          20.6
##  7           38.9          17.8
##  8           39.2          19.6
##  9           34.1          18.1
## 10           42            20.2
## # ℹ 334 more rows

16.2.3 読まない列

vroom("data/penguins.csv",
      col_select = !c(species, island))
## # A tibble: 344 × 6
##    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex     year
##             <dbl>         <dbl>             <dbl>       <dbl> <chr>  <dbl>
##  1           39.1          18.7               181        3750 male    2007
##  2           39.5          17.4               186        3800 female  2007
##  3           40.3          18                 195        3250 female  2007
##  4           NA            NA                  NA          NA <NA>    2007
##  5           36.7          19.3               193        3450 female  2007
##  6           39.3          20.6               190        3650 male    2007
##  7           38.9          17.8               181        3625 female  2007
##  8           39.2          19.6               195        4675 male    2007
##  9           34.1          18.1               193        3475 <NA>    2007
## 10           42            20.2               190        4250 <NA>    2007
## # ℹ 334 more rows
vroom("data/penguins.csv",
      col_select = list(!c(starts_with("bill"), starts_with("s"))))
## # A tibble: 344 × 4
##    island    flipper_length_mm body_mass_g  year
##    <chr>                 <dbl>       <dbl> <dbl>
##  1 Torgersen               181        3750  2007
##  2 Torgersen               186        3800  2007
##  3 Torgersen               195        3250  2007
##  4 Torgersen                NA          NA  2007
##  5 Torgersen               193        3450  2007
##  6 Torgersen               190        3650  2007
##  7 Torgersen               181        3625  2007
##  8 Torgersen               195        4675  2007
##  9 Torgersen               193        3475  2007
## 10 Torgersen               190        4250  2007
## # ℹ 334 more rows

16.3 読み込む型を指定

16.3.1 変数ごとに指定

  • 因子型 “f”
  • skip “_”
  • 文字型 “c”
vroom("data/penguins.csv",
      col_types = c(species = "f", island = "_", bill_length_mm = "c"))
## # A tibble: 344 × 7
##    species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex   
##    <fct>   <chr>                  <dbl>             <dbl>       <dbl> <chr> 
##  1 Adelie  39.1                    18.7               181        3750 male  
##  2 Adelie  39.5                    17.4               186        3800 female
##  3 Adelie  40.3                    18                 195        3250 female
##  4 Adelie  <NA>                    NA                  NA          NA <NA>  
##  5 Adelie  36.7                    19.3               193        3450 female
##  6 Adelie  39.3                    20.6               190        3650 male  
##  7 Adelie  38.9                    17.8               181        3625 female
##  8 Adelie  39.2                    19.6               195        4675 male  
##  9 Adelie  34.1                    18.1               193        3475 <NA>  
## 10 Adelie  42                      20.2               190        4250 <NA>  
## # ℹ 334 more rows
## # ℹ 1 more variable: year <dbl>

16.3.2 すべて文字型で

vroom("data/penguins.csv",
      col_types = c(.default = "c"))
## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>     <chr>          <chr>         <chr>             <chr>      
##  1 Adelie  Torgersen 39.1           18.7          181               3750       
##  2 Adelie  Torgersen 39.5           17.4          186               3800       
##  3 Adelie  Torgersen 40.3           18            195               3250       
##  4 Adelie  Torgersen <NA>           <NA>          <NA>              <NA>       
##  5 Adelie  Torgersen 36.7           19.3          193               3450       
##  6 Adelie  Torgersen 39.3           20.6          190               3650       
##  7 Adelie  Torgersen 38.9           17.8          181               3625       
##  8 Adelie  Torgersen 39.2           19.6          195               4675       
##  9 Adelie  Torgersen 34.1           18.1          193               3475       
## 10 Adelie  Torgersen 42             20.2          190               4250       
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <chr>, year <chr>

17 dplyrあれこれ

17.1 select

17.1.1 変数の並びをアルファベット順にソートする

  • magrittr %>% じゃないと動かない
df <- 
  tribble(~item10, ~item1, ~item100,  ~item5, ~id,
                10,     1,      100,       5,   1) # データの値に特に意味はない

df %>% 
  select(sort(names(.)))
## # A tibble: 1 × 5
##      id item1 item10 item100 item5
##   <dbl> <dbl>  <dbl>   <dbl> <dbl>
## 1     1     1     10     100     5
  • ただし,数字の並びは昇順にならない

17.1.2 変数の並びを数字順にする

df |> 
  select(id, num_range("item", 1:100))
## # A tibble: 1 × 5
##      id item1 item5 item10 item100
##   <dbl> <dbl> <dbl>  <dbl>   <dbl>
## 1     1     1     5     10     100

17.2 rowごとの処理

17.2.1 そのまま

df <- 
  tibble(a = c(1,  1,  1),
         b = c(NA, 2,  2),
         c = c(NA, NA, 3))

df |> 
  mutate(total = a + b + c)
## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA    NA
## 2     1     2    NA    NA
## 3     1     2     3     6
df |> 
  mutate(total = sum(c(a, b, c), na.rm = TRUE))
## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA    10
## 2     1     2    NA    10
## 3     1     2     3    10
df |> 
  mutate(mean_abc = mean(c(a,b,c), na.rm = TRUE))
## # A tibble: 3 × 4
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA     1.67
## 2     1     2    NA     1.67
## 3     1     2     3     1.67
# 非NAの6セルで合計10を割る
10/6
## [1] 1.666667

17.2.2 rowwise

17.2.2.1 sum

df |> 
  rowwise() |> 
  mutate(total = sum(c(a, b, c), na.rm = TRUE))
## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA     1
## 2     1     2    NA     3
## 3     1     2     3     6

17.2.2.2 mean

df |> 
  rowwise() |> 
  mutate(mean_abc = mean(c(a, b, c), na.rm = TRUE))
## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA      1  
## 2     1     2    NA      1.5
## 3     1     2     3      2

17.2.2.3 selection helperを使う

df |> 
  rowwise() |> 
  mutate(mean_abc = mean(c_across(a:c), na.rm = TRUE))
## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA      1  
## 2     1     2    NA      1.5
## 3     1     2     3      2
df |> 
  rowwise() |> 
  mutate(mean_abc = mean(c_across(everything()), na.rm = TRUE))
## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA      1  
## 2     1     2    NA      1.5
## 3     1     2     3      2

17.2.3 rowSumsとrowMean

  • こっちの方が早い
df |> 
  mutate(total = rowSums(pick(a:c), na.rm = TRUE))
## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA     1
## 2     1     2    NA     3
## 3     1     2     3     6
df |> 
  mutate(total = rowMeans(pick(a:c), na.rm = TRUE))
## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA   1  
## 2     1     2    NA   1.5
## 3     1     2     3   2
# dplyr < 1.1.0
# df |> 
#   mutate(total = rowSums(across(a:c), na.rm = TRUE))
# 
# df |> 
#   mutate(total = rowMeans(across(a:c), na.rm = TRUE))

17.3 rename

17.3.1 変更対応のリストを代入したい時は名前付きベクトルで

old <- c("species", "island", "bill_length_mm")
new <- c("種", "島", "くちばし長さ")
newvars <- setNames(old, new)

penguins |>
  rename(!!!newvars)
## # A tibble: 344 × 8
##    種     島      くちばし長さ bill_depth_mm flipper_length_mm body_mass_g sex  
##    <fct>  <fct>          <dbl>         <dbl>             <int>       <int> <fct>
##  1 Adelie Torger…         39.1          18.7               181        3750 male 
##  2 Adelie Torger…         39.5          17.4               186        3800 fema…
##  3 Adelie Torger…         40.3          18                 195        3250 fema…
##  4 Adelie Torger…         NA            NA                  NA          NA <NA> 
##  5 Adelie Torger…         36.7          19.3               193        3450 fema…
##  6 Adelie Torger…         39.3          20.6               190        3650 male 
##  7 Adelie Torger…         38.9          17.8               181        3625 fema…
##  8 Adelie Torger…         39.2          19.6               195        4675 male 
##  9 Adelie Torger…         34.1          18.1               193        3475 <NA> 
## 10 Adelie Torger…         42            20.2               190        4250 <NA> 
## # ℹ 334 more rows
## # ℹ 1 more variable: year <int>

17.3.1.1 変更対応リストがデータフレームであるとき

rename_df <- tibble(
old = c("species", "island", "bill_length_mm"),
new = c("種", "島", "くちばし長さ")
)

rename_df
## # A tibble: 3 × 2
##   old            new         
##   <chr>          <chr>       
## 1 species        種          
## 2 island         島          
## 3 bill_length_mm くちばし長さ
rename_df |> 
  relocate(new) |> # newを最初に並べ替え
deframe()
##               種               島     くちばし長さ 
##        "species"         "island" "bill_length_mm"

17.4 mutate

17.4.1 case_whenで値ではなく関数や式を指定する

tribble(~a, ~b,
        0, 1,
        1, 2,
        2, 3) |> 
  mutate(b = case_when(a == 0 ~ (\(x) x+1)(b),
                       a == 1 ~ (\(x) x+2)(b),
                       .default = b)
  )
## # A tibble: 3 × 2
##       a     b
##   <dbl> <dbl>
## 1     0     2
## 2     1     4
## 3     2     3

17.4.2 対照表を元に値を変換していく

17.4.2.1 case_whenの場合

  • 対照表なし
penguins |> 
  mutate(speciesj = case_when(
                       species == "Adelie" ~ "アデリー",
                       species == "Chinstrap" ~ "ヒゲ",
                       species == "Gentoo" ~ "ジェンツー",
                       .default = "その他"),
                       .keep = "used"
  ) |> 
  count(species, speciesj)
## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124
  # group_by(species) |> 
  # slice_head(n = 2)

17.4.2.2 case_matchの場合

  • 対照表なし
penguins |> 
  mutate(speciesj = case_match(species,
                       "Adelie" ~ "アデリー",
                       "Chinstrap" ~ "ヒゲ",
                       "Gentoo" ~ "ジェンツー",
                       .default = "その他"
                      ),
                       .keep = "used"
  ) |> 
  count(species, speciesj)
## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124
  # group_by(species) |> 
  # slice_head(n = 2)

17.4.2.3 left_joinでも置き換えと同様の結果を得られる

  • 対照表あり。大量の置き換えならこれが最もお手軽な気がする
    • reference table and left_join
old <- 
  penguins |> 
  count(species) |> 
  pull(species)

new <- c("アデリー", "ヒゲ",   "ジェンツー")

taisho <- 
  tibble(species = old,
         speciesj = new)

penguins |> 
  left_join(taisho) |> 
  count(species, speciesj)
## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124

17.4.2.4 名前付きベクトルで置き換え

new_old <- 
  new |>  
  set_names(old)

# オブジェクト作らずnewを上書き
# names(new) <- old

penguins |> 
  mutate(speciesj = new_old[species]) |> 
  count(species, speciesj)
## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124

17.4.3 複数回答が1つの列にテキストで入っている時,回答別のフラグ列を作成

17.4.3.1 サンプルデータ作成

# 区切りにスペースなし
df <- tribble(~id, ~multi_answer,
              1, "りんご,みかん,ぶどう",
              2, "りんご,ぶどう",
              3, "メロン,ぶどう",
              4, "ぶどう,りんご,みかん,メロン")

df
## # A tibble: 4 × 2
##      id multi_answer               
##   <dbl> <chr>                      
## 1     1 りんご,みかん,ぶどう       
## 2     2 りんご,ぶどう              
## 3     3 メロン,ぶどう              
## 4     4 ぶどう,りんご,みかん,メロン
# スペース入っている場合
# tribble(~id, ~multi_answer,
#         1, "りんご, みかん, ぶどう",
#         2, "りんご, ぶどう",
#         3, "メロン, ぶどう",
#         4, "ぶどう, りんご, みかん, メロン") |> 
#   mutate(multi_answer = str_remove_all(multi_answer, " ")) # スペース除去

17.4.3.2 文字列をリストに分割して新しい列に格納

df <- 
  df |> 
  mutate(multi_answer_l = str_split(multi_answer, ",")) # 区切りは","

df
## # A tibble: 4 × 3
##      id multi_answer                multi_answer_l
##   <dbl> <chr>                       <list>        
## 1     1 りんご,みかん,ぶどう        <chr [3]>     
## 2     2 りんご,ぶどう               <chr [2]>     
## 3     3 メロン,ぶどう               <chr [2]>     
## 4     4 ぶどう,りんご,みかん,メロン <chr [4]>

17.4.3.3 各列に分割

df <- 
  df |> tidyr::hoist(multi_answer_l, 
             # 列名指定   
                     f1 = 1, 
                     f2 = 2, 
                     f3 = 3, 
                     f4 = 4) 

df
## # A tibble: 4 × 6
##      id multi_answer                f1     f2     f3     f4    
##   <dbl> <chr>                       <chr>  <chr>  <chr>  <chr> 
## 1     1 りんご,みかん,ぶどう        りんご みかん ぶどう <NA>  
## 2     2 りんご,ぶどう               りんご ぶどう <NA>   <NA>  
## 3     3 メロン,ぶどう               メロン ぶどう <NA>   <NA>  
## 4     4 ぶどう,りんご,みかん,メロン ぶどう りんご みかん メロン

17.4.3.4 選択肢ごとにフラグを表す列を追加

 df |> 
  mutate(りんご = case_when(f1 == "りんご" |
                            f2 == "りんご" |
                            f3 == "りんご" |
                            f4 == "りんご" ~ 1,
                                      TRUE ~ 0),
         みかん = case_when(f1 == "みかん" |
                            f2 == "みかん" |
                            f3 == "みかん" |
                            f4 == "みかん" ~ 1,
                                      TRUE ~ 0),
         ぶどう = case_when(f1 == "ぶどう" |
                            f2 == "ぶどう" |
                            f3 == "ぶどう" |
                            f4 == "ぶどう" ~ 1,
                                      TRUE ~ 0),
         メロン = case_when(f1 == "メロン" |
                            f2 == "メロン" |
                            f3 == "メロン" |
                            f4 == "メロン" ~ 1,
                                      TRUE ~ 0)
  )
## # A tibble: 4 × 10
##      id multi_answer         f1    f2    f3    f4    りんご みかん ぶどう メロン
##   <dbl> <chr>                <chr> <chr> <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     1 りんご,みかん,ぶどう りん… みか… ぶど… <NA>       1      1      1      0
## 2     2 りんご,ぶどう        りん… ぶど… <NA>  <NA>       1      0      1      0
## 3     3 メロン,ぶどう        メロ… ぶど… <NA>  <NA>       0      0      1      1
## 4     4 ぶどう,りんご,みか…  ぶど… りん… みか… メロ…      1      1      1      1
17.4.3.4.1 【効率化】
17.4.3.4.1.1 関数作成
# 回答選択肢ごとに列を作る関数作成
multi_answer <- function(data, answer){
  data |> 
    mutate(!!sym({{answer}}) := case_when(f1 == answer |
                                       f2 == answer |
                                       f3 == answer |
                                       f4 == answer ~ 1,
                                                 TRUE ~ 0))
}


multi_answer(df, "りんご")
## # A tibble: 4 × 7
##      id multi_answer                f1     f2     f3     f4     りんご
##   <dbl> <chr>                       <chr>  <chr>  <chr>  <chr>   <dbl>
## 1     1 りんご,みかん,ぶどう        りんご みかん ぶどう <NA>        1
## 2     2 りんご,ぶどう               りんご ぶどう <NA>   <NA>        1
## 3     3 メロン,ぶどう               メロン ぶどう <NA>   <NA>        0
## 4     4 ぶどう,りんご,みかん,メロン ぶどう りんご みかん メロン      1
17.4.3.4.1.2 関数適用
multi_answer(df, "りんご") |> 
multi_answer("みかん") |> 
multi_answer("ぶどう") |> 
multi_answer("メロン")  
## # A tibble: 4 × 10
##      id multi_answer         f1    f2    f3    f4    りんご みかん ぶどう メロン
##   <dbl> <chr>                <chr> <chr> <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     1 りんご,みかん,ぶどう りん… みか… ぶど… <NA>       1      1      1      0
## 2     2 りんご,ぶどう        りん… ぶど… <NA>  <NA>       1      0      1      0
## 3     3 メロン,ぶどう        メロ… ぶど… <NA>  <NA>       0      0      1      1
## 4     4 ぶどう,りんご,みか…  ぶど… りん… みか… メロ…      1      1      1      1
17.4.3.4.2 【さらに効率化】
contents <- 
  c("りんご", "みかん", "ぶどう", "メロン")

df_multi <- 
  map(contents, \(x) multi_answer(df, x)) |> 
  reduce(full_join)

df_multi
## # A tibble: 4 × 10
##      id multi_answer         f1    f2    f3    f4    りんご みかん ぶどう メロン
##   <dbl> <chr>                <chr> <chr> <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     1 りんご,みかん,ぶどう りん… みか… ぶど… <NA>       1      1      1      0
## 2     2 りんご,ぶどう        りん… ぶど… <NA>  <NA>       1      0      1      0
## 3     3 メロン,ぶどう        メロ… ぶど… <NA>  <NA>       0      0      1      1
## 4     4 ぶどう,りんご,みか…  ぶど… りん… みか… メロ…      1      1      1      1

17.5 across周り

17.5.1 作成する変数の名前をコントロール

  • across()の中で.names = 引数を入れる
penguins |> 
  summarise(across(c(bill_length_mm, bill_depth_mm),
                   list(
                     mean = \(x) mean(x, na.rm = TRUE),
                     n    = \(x) sum(!is.na(x))),
                   .names = "{.fn}_{.col}"
                   )
            
            )
## # A tibble: 1 × 4
##   mean_bill_length_mm n_bill_length_mm mean_bill_depth_mm n_bill_depth_mm
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342

18 tidyrあれこれ

18.1 pivot_wider

18.1.1 基本

mtcars |> 
  count(vs)
##   vs  n
## 1  0 18
## 2  1 14
mtcars |> 
  count(vs) |> 
  pivot_wider(names_from = vs,
              values_from = n)
## # A tibble: 1 × 2
##     `0`   `1`
##   <int> <int>
## 1    18    14

names_fromに複数列指定

mtcars |> 
  count(vs) |> 
  mutate(name = "vs")
##   vs  n name
## 1  0 18   vs
## 2  1 14   vs
mtcars |> 
  count(vs) |>
  mutate(name = "vs") |> 
  pivot_wider(names_from = c(name,vs),
              values_from = n)
## # A tibble: 1 × 2
##    vs_0  vs_1
##   <int> <int>
## 1    18    14

names_glue(namesの変数1つ)

mtcars |> 
  count(vs) |>
  pivot_wider(names_from = vs,
              names_glue ="{.value}_{.name}",
              values_from = n)
## # A tibble: 1 × 2
##     n_0   n_1
##   <int> <int>
## 1    18    14

names_glue(namesの列名指定)

mtcars |> 
  count(vs) |>
  pivot_wider(names_from = vs,
              names_glue ="{.value}_{vs}",
              values_from = n)
## # A tibble: 1 × 2
##     n_0   n_1
##   <int> <int>
## 1    18    14

names_glue(namesの変数2つ)

mtcars |> 
     count(vs) |>
     mutate(name = "vs") |> 
     pivot_wider(names_from = c(name,vs),
                 names_glue ="{.value}_{.name}",
                 values_from = n)
## # A tibble: 1 × 2
##   n_vs_0 n_vs_1
##    <int>  <int>
## 1     18     14

18.1.2 (🚧作成中)関数にして複数変数への適用

19 回帰

19.1 線形

res <- 
  lm(mpg ~ ., data = mtcars)

broom::tidy(res)
## # A tibble: 11 × 5
##    term        estimate std.error statistic p.value
##    <chr>          <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)  12.3      18.7        0.657  0.518 
##  2 cyl          -0.111     1.05      -0.107  0.916 
##  3 disp          0.0133    0.0179     0.747  0.463 
##  4 hp           -0.0215    0.0218    -0.987  0.335 
##  5 drat          0.787     1.64       0.481  0.635 
##  6 wt           -3.72      1.89      -1.96   0.0633
##  7 qsec          0.821     0.731      1.12   0.274 
##  8 vs            0.318     2.10       0.151  0.881 
##  9 am            2.52      2.06       1.23   0.234 
## 10 gear          0.655     1.49       0.439  0.665 
## 11 carb         -0.199     0.829     -0.241  0.812

19.1.1 標準化係数

res2 <- 
  lm.beta::lm.beta(res)

broom::tidy(res2)
## # A tibble: 11 × 6
##    term        estimate std_estimate std.error statistic p.value
##    <chr>          <dbl>        <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)  12.3         NA        18.7        0.657  0.518 
##  2 cyl          -0.111       -0.0330    1.05      -0.107  0.916 
##  3 disp          0.0133       0.274     0.0179     0.747  0.463 
##  4 hp           -0.0215      -0.244     0.0218    -0.987  0.335 
##  5 drat          0.787        0.0698    1.64       0.481  0.635 
##  6 wt           -3.72        -0.603     1.89      -1.96   0.0633
##  7 qsec          0.821        0.243     0.731      1.12   0.274 
##  8 vs            0.318        0.0266    2.10       0.151  0.881 
##  9 am            2.52         0.209     2.06       1.23   0.234 
## 10 gear          0.655        0.0802    1.49       0.439  0.665 
## 11 carb         -0.199       -0.0534    0.829     -0.241  0.812

20 相関

20.1 基本

library(corrr)

mtcars |> 
  correlate()
## # A tibble: 11 × 12
##    term     mpg    cyl   disp     hp    drat     wt    qsec     vs      am
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
##  1 mpg   NA     -0.852 -0.848 -0.776  0.681  -0.868  0.419   0.664  0.600 
##  2 cyl   -0.852 NA      0.902  0.832 -0.700   0.782 -0.591  -0.811 -0.523 
##  3 disp  -0.848  0.902 NA      0.791 -0.710   0.888 -0.434  -0.710 -0.591 
##  4 hp    -0.776  0.832  0.791 NA     -0.449   0.659 -0.708  -0.723 -0.243 
##  5 drat   0.681 -0.700 -0.710 -0.449 NA      -0.712  0.0912  0.440  0.713 
##  6 wt    -0.868  0.782  0.888  0.659 -0.712  NA     -0.175  -0.555 -0.692 
##  7 qsec   0.419 -0.591 -0.434 -0.708  0.0912 -0.175 NA       0.745 -0.230 
##  8 vs     0.664 -0.811 -0.710 -0.723  0.440  -0.555  0.745  NA      0.168 
##  9 am     0.600 -0.523 -0.591 -0.243  0.713  -0.692 -0.230   0.168 NA     
## 10 gear   0.480 -0.493 -0.556 -0.126  0.700  -0.583 -0.213   0.206  0.794 
## 11 carb  -0.551  0.527  0.395  0.750 -0.0908  0.428 -0.656  -0.570  0.0575
## # ℹ 2 more variables: gear <dbl>, carb <dbl>

20.1.1 変数絞って表示

mtcars |> 
  correlate() |> 
  focus(mpg, disp, wt, mirror = TRUE)
## # A tibble: 3 × 4
##   term     mpg   disp     wt
##   <chr>  <dbl>  <dbl>  <dbl>
## 1 mpg   NA     -0.848 -0.868
## 2 disp  -0.848 NA      0.888
## 3 wt    -0.868  0.888 NA

20.1.2 下側のみに

mtcars |> 
  correlate() |> 
  focus(mpg, disp, wt, mirror = TRUE) |> 
  shave()
## # A tibble: 3 × 4
##   term     mpg   disp    wt
##   <chr>  <dbl>  <dbl> <dbl>
## 1 mpg   NA     NA        NA
## 2 disp  -0.848 NA        NA
## 3 wt    -0.868  0.888    NA

20.1.3 ペアワイズ時のnを算出

psychTools::bfi |>
  select(A1,A2,C1,C2) |> 
  pair_n()
##      A1   A2   C1   C2
## A1 2784 2757 2764 2761
## A2 2757 2773 2752 2752
## C1 2764 2752 2779 2755
## C2 2761 2752 2755 2776
## attr(,"class")
## [1] "n_mat"  "matrix"
# データフレームに

psychTools::bfi |>
  select(A1,A2,C1,C2) |> 
  pair_n() |> 
  as_tibble()
## # A tibble: 4 × 4
##   A1      A2      C1      C2     
##   <n_mat> <n_mat> <n_mat> <n_mat>
## 1 2784    2757    2764    2761   
## 2 2757    2773    2752    2752   
## 3 2764    2752    2779    2755   
## 4 2761    2752    2755    2776

20.2 層別

mtcars |>
  select(mpg, disp, wt, cyl) |>
  nest(data = !cyl) |>
  mutate(
    correlations = map(data, correlate)
  ) |>
  unnest(correlations)
## # A tibble: 9 × 6
##     cyl data              term     mpg   disp     wt
##   <dbl> <list>            <chr>  <dbl>  <dbl>  <dbl>
## 1     6 <tibble [7 × 3]>  mpg   NA      0.103 -0.682
## 2     6 <tibble [7 × 3]>  disp   0.103 NA      0.473
## 3     6 <tibble [7 × 3]>  wt    -0.682  0.473 NA    
## 4     4 <tibble [11 × 3]> mpg   NA     -0.805 -0.713
## 5     4 <tibble [11 × 3]> disp  -0.805 NA      0.857
## 6     4 <tibble [11 × 3]> wt    -0.713  0.857 NA    
## 7     8 <tibble [14 × 3]> mpg   NA     -0.520 -0.650
## 8     8 <tibble [14 × 3]> disp  -0.520 NA      0.755
## 9     8 <tibble [14 × 3]> wt    -0.650  0.755 NA

20.2.1 1つのデータフレームに

mtcars |> 
  select(mpg, disp, wt, cyl) |> 
  group_by(cyl) |>
  group_modify(~ correlate(.x))
## # A tibble: 9 × 5
## # Groups:   cyl [3]
##     cyl term     mpg   disp     wt
##   <dbl> <chr>  <dbl>  <dbl>  <dbl>
## 1     4 mpg   NA     -0.805 -0.713
## 2     4 disp  -0.805 NA      0.857
## 3     4 wt    -0.713  0.857 NA    
## 4     6 mpg   NA      0.103 -0.682
## 5     6 disp   0.103 NA      0.473
## 6     6 wt    -0.682  0.473 NA    
## 7     8 mpg   NA     -0.520 -0.650
## 8     8 disp  -0.520 NA      0.755
## 9     8 wt    -0.650  0.755 NA

20.2.2 listのままにして加工してから1つのデータフレームに

# groupにする変数の水準を抜き出す
cyl_level <- 
mtcars |> 
  count(cyl) |> 
  pull(cyl)

mtcars |> 
  select(mpg, disp, wt, cyl) |> 
  group_by(cyl) |> 
  group_map(~ correlate(.)) |>  # ここは旧表記じゃないと動かない
  map(\(x) shave(x)) |>         # 相関行列の上側を
  set_names(cyl_level) |>       # group変数の水準を適用
  bind_rows(.id = "cyl_level")
## # A tibble: 9 × 5
##   cyl_level term     mpg   disp    wt
##   <chr>     <chr>  <dbl>  <dbl> <dbl>
## 1 4         mpg   NA     NA        NA
## 2 4         disp  -0.805 NA        NA
## 3 4         wt    -0.713  0.857    NA
## 4 6         mpg   NA     NA        NA
## 5 6         disp   0.103 NA        NA
## 6 6         wt    -0.682  0.473    NA
## 7 8         mpg   NA     NA        NA
## 8 8         disp  -0.520 NA        NA
## 9 8         wt    -0.650  0.755    NA

21 時間測定

21.1 tictoc

tictoc::tic()
# tic()とtoc()の間に処理を挟む
Sys.sleep(1) # 1秒待つ
tictoc::toc()
## 1.03 sec elapsed

21.1.1 タイトルをつけてログを残す

tictoc::tic("1秒待つ")
Sys.sleep(1)
tictoc::toc(log = TRUE)
## 1秒待つ: 1.02 sec elapsed
tictoc::tic("2秒待つ")
Sys.sleep(2)
tictoc::toc(log = TRUE)
## 2秒待つ: 2.03 sec elapsed

21.1.1.1 ログ出力

tictoc::tic.log() |> 
  unlist() |> 
  tibble()
## # A tibble: 2 × 1
##   `unlist(tictoc::tic.log())`
##   <chr>                      
## 1 1秒待つ: 1.02 sec elapsed  
## 2 2秒待つ: 2.03 sec elapsed

22 コードの書式修正


22.1 関連ショートカット

  • ctrl + I 再インデント(階層構造をきれいにする)
  • ctrl + shift + A 再フォーマット(複数行にする)

22.1.1 例 

mtcars |>  
select(vs,am,gear) 

23 引用関連

23.1 RとR Studioまたはパッケージの引用

23.2 R Studioでの文献引用

24 ✍Webスクレイピング

25 R Markdown

25.2 チャンクの色

# {r class.source="bg-info"}
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# {r class.source="bg-primary"}
# {r class.source="bg-success"}
# {r class.source="bg-warning"}
# {r class.source="bg-danger"}

25.2.1 出力チャンクと色分け

# {r class.source="bg-danger", class.output="bg-warning"}

head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

26 Excel関連

26.1 openxlsx

# 空のワークブックとシートを作成
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb, "ペンギン")
addWorksheet(wb, "ペンギンraw")

# フォントをデフォルトからMS Pゴシックに変更
modifyBaseFont(wb, "MS PGothic")

# データをwbに入れる
writeData(wb, sheet = "ペンギン", penguins, startCol = 1, startRow = 1)
writeData(wb, sheet = "ペンギンraw", penguins_raw, startCol = 1, startRow = 1)

# 先頭行および先頭列固定
freezePane(wb, "ペンギン", firstRow = TRUE, firstCol = TRUE) 
freezePane(wb, "ペンギンraw", firstRow = TRUE, firstCol = TRUE)


# 保存
saveWorkbook(wb, "out/penguins.xlsx", overwrite = TRUE)

27 R以外の関連メモ

27.1 画面の動画キャプチャをgifファイルに

  • 動画だけならwindowsキー + G でもできる
  • ScreenToGifを使う