1 パッケージ読み込み

library(tidyverse)
library(palmerpenguins)
library(janitor)

1.1 パッケージ入ってない場合はインストールして読み込む場合（コードのみ表示）

https://community.rstudio.com/t/is-it-a-good-idea-to-conditionally-load-install-libraries-at-the-beginning-of-a-script/15334/5

if (!require("palmerpenguins")) {
   install.packages("palmerpenguins")
   library(palmerpenguins)
}

2 github連携

2.1 githubのレポジトリをローカルに作成して管理

github上でCode>cloneでurlをコピー
RStudioでNew Project>Version Control>Git>Repositry URL にurl貼り付けてCreate Project

3 ベクトル操作

3.1 作成　

seq(1,3, by = 1)

## [1] 1 2 3

seq(1,10, by = 2)

## [1] 1 3 5 7 9

rep(1, 3)

## [1] 1 1 1

rep(1:3, 3)

## [1] 1 2 3 1 2 3 1 2 3

rep(1:3, each = 3)

## [1] 1 1 1 2 2 2 3 3 3

3.2 要素を削除

dplyr::setdiff

abcd <- c("a", "b", "c", "d")
rmitem <- c("b", "c")

abcd |> 
  setdiff(rmitem)

## [1] "a" "d"

baseの操作

abcd [! abcd %in% rmitem]

## [1] "a" "d"

3.3 要素のソート

sort(c("c", "a", "b"))

## [1] "a" "b" "c"

4 正規表現

regular expression (regexp)
参照
- https://stringr.tidyverse.org/articles/regular-expressions.html

4.1 数字,文字

4.1.1 例の作成

num <- c("1", "11", "111", "l", "|", "one", "一")

4.1.2 数字

どれも同じ

一字

str_view(num, "[0-9]")

## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>

str_view(num, "[:digit:]")

## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>

str_view(num, "\\d")

## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>

三字

str_view(num, "[0-9][0-9][0-9]")

## [3] │ <111>

4.1.3 アルファベット

アルファベットのみ

str_view(num, "[a-z]")

## [4] │ <l>
## [6] │ <o><n><e>

日本語も

str_view(num, "[:alpha:]") # 日本語も入る

## [4] │ <l>
## [6] │ <o><n><e>
## [7] │ <一>

str_view_all

str_view_all(num, "[a-z]")

## [1] │ 1
## [2] │ 11
## [3] │ 111
## [4] │ <l>
## [5] │ |
## [6] │ <o><n><e>
## [7] │ 一

文字数指定

str_view(num, "[a-z][a-z][a-z]")

## [6] │ <one>

str_view(num, "[a-z]..")

## [6] │ <one>

str_view(num, "[a-z]{3}")

## [6] │ <one>

4.1.4 数字とアルファベット

一文字

str_view(num, "[0-9a-z]")

## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>
## [4] │ <l>
## [6] │ <o><n><e>

すべて

str_view_all(num, "[0-9a-z]")

## [1] │ <1>
## [2] │ <1><1>
## [3] │ <1><1><1>
## [4] │ <l>
## [5] │ |
## [6] │ <o><n><e>
## [7] │ 一

指定文字数

str_view(num, "...")

## [3] │ <111>
## [6] │ <one>

4.2 単語のつながり

4.2.1 例の作成

text <- c("test", "test_test", "test_test_test", "testtest", 
          "test_testtest", "testtest_test", "test_test_","teest_teest")

4.2.2 繰り返し

一回

str_view(text, "(....)_\\1")

## [2] │ <test_test>
## [3] │ <test_test>_test
## [5] │ <test_test>test
## [6] │ test<test_test>
## [7] │ <test_test>_

str_view(text, "(.+)_\\1")

## [2] │ <test_test>
## [3] │ <test_test>_test
## [5] │ <test_test>test
## [6] │ test<test_test>
## [7] │ <test_test>_
## [8] │ <teest_teest>

str_view(text, "(....)\\1")

## [4] │ <testtest>
## [5] │ test_<testtest>
## [6] │ <testtest>_test

str_view(text, "(....)\\1_\\1")

## [6] │ <testtest_test>

二回

str_view(text, "(....)_\\1_\\1")

## [3] │ <test_test_test>

4.3 区分する点を探す

変数名でpivot_longerするときに

4.3.1 検討用データフレーム作成

mtcars_vsam <- 
  mtcars |> 
  summarise(across(c(vs, am),
                   list(
                     p = \(x) mean(x, na.rm = TRUE),
                     n = \(x) sum(x, na.rm = TRUE),
               total_n = \(x) sum(!is.na(x))
                   ))
            )

mtcars_vsam

##     vs_p vs_n vs_total_n    am_p am_n am_total_n
## 1 0.4375   14         32 0.40625   13         32

# purrr 1.0.0より前
# mtcars_vsam <- 
#   mtcars |> 
#   summarise(across(c(vs, am),
#                    list(
#                      p = ~mean(., na.rm = TRUE),
#                      n = ~sum(., na.rm = TRUE),
#                total_n = ~sum(!is.na(.))
#                    ))
#             )

4.3.1.1 meanとsdを列名にするpivot_longer(×な例)

total_nも列名にもってきたい

mtcars_vsam |> 
  pivot_longer(everything(),
               names_to = c("items", ".value"),
               names_pattern = "(.*)_(.*)")

## # A tibble: 4 × 3
##   items         p     n
##   <chr>     <dbl> <dbl>
## 1 vs        0.438    14
## 2 vs_total NA        32
## 3 am        0.406    13
## 4 am_total NA        32

4.3.2 変数名の文字の長さ確認

# 降順に並び替えて1行目を取得
max_value <- 
names(mtcars_vsam) |>
  str_count() |>
  as_tibble() |>
  arrange(desc(value))

max_value

## # A tibble: 6 × 1
##   value
##   <int>
## 1    10
## 2    10
## 3     4
## 4     4
## 5     4
## 6     4

# 文字数の最大値取得
max_value <- 
  max_value |> 
  slice(1)

# total_pの前までで切る情報を追加したいため、最大値-total_pの文字数計算

max_value - str_count("total_p")

##   value
## 1     3

4.3.2.1 meanとsdを列名にするpivot_longer(okな例)

mtcars_vsam |> 
  pivot_longer(everything(),
               names_to = c("items", ".value"),
               names_pattern = "(.{1,2})_(.+)") # _の前は1～2字までに限定

## # A tibble: 2 × 4
##   items     p     n total_n
##   <chr> <dbl> <dbl>   <int>
## 1 vs    0.438    14      32
## 2 am    0.406    13      32

4.3.2.1.1 {n}の挙動

https://r4ds.had.co.nz/strings.html#repetition

文字

str_view("xxxxx", ".{1}")

## [1] │ <x><x><x><x><x>

str_view("xxxxx", "x{3}")

## [1] │ <xxx>xx

x <- c("xx_xxx","x_xx_xxx_xxxx_xxxxx")

str_view(x, "x{2,4}")

## [1] │ <xx>_<xxx>
## [2] │ x_<xx>_<xxx>_<xxxx>_<xxxx>x

str_view_all(x, "x{2,4}")

## [1] │ <xx>_<xxx>
## [2] │ x_<xx>_<xxx>_<xxxx>_<xxxx>x

数字

numbers <- 
  c("123", "123456", "123456789")

str_view(numbers, ".{4,}")

## [2] │ <123456>
## [3] │ <123456789>

# str_view(numbers, ".{,9}")

str_view(numbers, ".{4,7}")

## [2] │ <123456>
## [3] │ <1234567>89

str_view(numbers, ".{4,7}?") # lazy

## [2] │ <1234>56
## [3] │ <1234><5678>9

4.4 連続量を2値化して割合を出して1つのデータフレームに

4.4.1 2値変数作成

df_peng <-
  penguins |>
  mutate(across(ends_with("mm"),
                list(m = \(x) if_else(x > mean(x, na.rm = TRUE),1,0)))
         )

4.4.2 割合とnとtotal_nを算出

df_peng_res <- 
df_peng |>
  summarise(across(ends_with("_m"),
                   list( p    = \(x) mean(x, na.rm = TRUE),
                         n    = \(x) sum(x, na.rm = TRUE),           # bill_length_mm_mが1のn
                         total_n    = \(x) sum(!is.na(x))))  # bill_length_mm_mのna以外のn（total）
  )


# 変数名の文字の長さ確認
# 降順に並び替えて1行目を取得
max_value <- 
names(df_peng_res) |>
  str_count() |>
  as_tibble() |>
  arrange(desc(value))

max_value

## # A tibble: 9 × 1
##   value
##   <int>
## 1    27
## 2    24
## 3    23
## 4    21
## 5    21
## 6    18
## 7    18
## 8    17
## 9    17

# 文字数の最大値取得
max_value <- 
  max_value |> 
  slice(1)

# total_pの前までで切る情報を追加したいため、最大値-total_pの文字数計算

max_value - str_count("total_p")

##   value
## 1    20

4.4.3 meanとsdを列名にするpivot_longer

df_peng_res |> 
pivot_longer(everything(),
               names_to = c("items", ".value"),
               names_pattern = "(.{1,20})_(.+)") # _の前は1～20字までに限定

## # A tibble: 3 × 4
##   items                   p     n total_n
##   <chr>               <dbl> <dbl>   <int>
## 1 bill_length_mm_m    0.512   175     342
## 2 bill_depth_mm_m     0.532   182     342
## 3 flipper_length_mm_m 0.433   148     342

5 度数分布とクロス表

5.1 度数分布

df <- tribble(~moji,
              "a",
              NA,
              "c",
              "a",
              "c",
              "a")

5.1.1 文字型でデフォルト

df |> count(moji)

## # A tibble: 3 × 2
##   moji      n
##   <chr> <int>
## 1 a         3
## 2 c         2
## 3 <NA>      1

df |> tabyl(moji)

##  moji n   percent valid_percent
##     a 3 0.5000000           0.6
##     c 2 0.3333333           0.4
##  <NA> 1 0.1666667            NA

5.1.1.1 NA外して

df |> 
  tabyl(moji, show_na = FALSE)

##  moji n percent
##     a 3     0.6
##     c 2     0.4

5.1.1.2 countで%

df |> 
  drop_na(moji) |> 
  count(moji) |> 
  mutate(percent = n/sum(n))

## # A tibble: 2 × 3
##   moji      n percent
##   <chr> <int>   <dbl>
## 1 a         3     0.6
## 2 c         2     0.4

5.1.2 因子型

本当はカテゴリ”b”があるがデータとしてないことを示す

df <- 
  df |> 
  mutate(moji = factor(moji, levels = c("a", "b", "c")))

df |> count(moji)

## # A tibble: 3 × 2
##   moji      n
##   <fct> <int>
## 1 a         3
## 2 c         2
## 3 <NA>      1

df |> tabyl(moji)

##  moji n   percent valid_percent
##     a 3 0.5000000           0.6
##     b 0 0.0000000           0.0
##     c 2 0.3333333           0.4
##  <NA> 1 0.1666667            NA

5.1.2.1 countでデータのない水準もn=0で示す

df |> count(moji, .drop = FALSE)

## # A tibble: 4 × 2
##   moji      n
##   <fct> <int>
## 1 a         3
## 2 b         0
## 3 c         2
## 4 <NA>      1

5.1.2.2 tabylでデータのない水準もn=0で示してNAは計算に入れずtotalも示す

df |> 
  tabyl(moji, show_na = FALSE) |> 
  adorn_totals()

##   moji n percent
##      a 3     0.6
##      b 0     0.0
##      c 2     0.4
##  Total 5     1.0

5.1.3 数字

df <- tibble(suji = c(1,1,0,0,0,NA))

df |> tabyl(suji)

##  suji n   percent valid_percent
##     0 3 0.5000000           0.6
##     1 2 0.3333333           0.4
##    NA 1 0.1666667            NA

df |> 
  summarise(percent = mean(suji, na.rm = TRUE),
            n = sum(suji, na.rm = TRUE),
            total_n = sum(!is.na(suji)))

## # A tibble: 1 × 3
##   percent     n total_n
##     <dbl> <dbl>   <int>
## 1     0.4     2       5

5.2 クロス表

5.2.1 count

mtcars |> 
  count(gear, vs)

##   gear vs  n
## 1    3  0 12
## 2    3  1  3
## 3    4  0  2
## 4    4  1 10
## 5    5  0  4
## 6    5  1  1

5.2.1.1 割合追加

gearの水準別のvsの割合(%)

mtcars |> 
  count(gear, vs) |> 
  group_by(gear) |> 
  mutate(perc = n/sum(n)*100)

## # A tibble: 6 × 4
## # Groups:   gear [3]
##    gear    vs     n  perc
##   <dbl> <dbl> <int> <dbl>
## 1     3     0    12  80  
## 2     3     1     3  20  
## 3     4     0     2  16.7
## 4     4     1    10  83.3
## 5     5     0     4  80  
## 6     5     1     1  20

5.2.2 欠損ある場合

penguins |> 
  count(species, sex)

## # A tibble: 8 × 3
##   species   sex        n
##   <fct>     <fct>  <int>
## 1 Adelie    female    73
## 2 Adelie    male      73
## 3 Adelie    <NA>       6
## 4 Chinstrap female    34
## 5 Chinstrap male      34
## 6 Gentoo    female    58
## 7 Gentoo    male      61
## 8 Gentoo    <NA>       5

5.2.2.1 割合追加

5.2.2.1.1 欠損あり

penguins |> 
  count(species, sex) |>
  group_by(species) |>
  mutate(perc = n/sum(n)*100)

## # A tibble: 8 × 4
## # Groups:   species [3]
##   species   sex        n  perc
##   <fct>     <fct>  <int> <dbl>
## 1 Adelie    female    73 48.0 
## 2 Adelie    male      73 48.0 
## 3 Adelie    <NA>       6  3.95
## 4 Chinstrap female    34 50   
## 5 Chinstrap male      34 50   
## 6 Gentoo    female    58 46.8 
## 7 Gentoo    male      61 49.2 
## 8 Gentoo    <NA>       5  4.03

5.2.2.1.2 欠損drop

penguins |> 
  drop_na(species, sex) |> 
  count(species, sex) |>
  group_by(species) |>
  mutate(perc = n/sum(n)*100)

## # A tibble: 6 × 4
## # Groups:   species [3]
##   species   sex        n  perc
##   <fct>     <fct>  <int> <dbl>
## 1 Adelie    female    73  50  
## 2 Adelie    male      73  50  
## 3 Chinstrap female    34  50  
## 4 Chinstrap male      34  50  
## 5 Gentoo    female    58  48.7
## 6 Gentoo    male      61  51.3

5.2.3 tabyl

mtcars |> 
  tabyl(gear, vs)

##  gear  0  1
##     3 12  3
##     4  2 10
##     5  4  1

5.2.3.1 割合

mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages()

##  gear         0         1
##     3 0.8000000 0.2000000
##     4 0.1666667 0.8333333
##     5 0.8000000 0.2000000

割合とn

mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages() |> 
  adorn_ns()

##  gear                      0                      1
##     3               0.8 (12)               0.2  (3)
##     4 0.166666666666667  (2) 0.833333333333333 (10)
##     5               0.8  (4)               0.2  (1)

割合（%）とn

mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages() |> 
  adorn_pct_formatting() |> 
  adorn_ns()

##  gear          0          1
##     3 80.0% (12) 20.0%  (3)
##     4 16.7%  (2) 83.3% (10)
##     5 80.0%  (4) 20.0%  (1)

nと割合（%)

mtcars |> 
  tabyl(gear, vs) |> 
  adorn_percentages() |> 
  adorn_pct_formatting() |> 
  adorn_ns(position = "front")

##  gear          0          1
##     3 12 (80.0%)  3 (20.0%)
##     4  2 (16.7%) 10 (83.3%)
##     5  4 (80.0%)  1 (20.0%)

5.2.4 3要因以上

penguins |> 
  count(island, species, sex) |>
  group_by(island, species) |>
  mutate(perc = n/sum(n)*100)

## # A tibble: 13 × 5
## # Groups:   island, species [5]
##    island    species   sex        n  perc
##    <fct>     <fct>     <fct>  <int> <dbl>
##  1 Biscoe    Adelie    female    22 50   
##  2 Biscoe    Adelie    male      22 50   
##  3 Biscoe    Gentoo    female    58 46.8 
##  4 Biscoe    Gentoo    male      61 49.2 
##  5 Biscoe    Gentoo    <NA>       5  4.03
##  6 Dream     Adelie    female    27 48.2 
##  7 Dream     Adelie    male      28 50   
##  8 Dream     Adelie    <NA>       1  1.79
##  9 Dream     Chinstrap female    34 50   
## 10 Dream     Chinstrap male      34 50   
## 11 Torgersen Adelie    female    24 46.2 
## 12 Torgersen Adelie    male      23 44.2 
## 13 Torgersen Adelie    <NA>       5  9.62

# mtcars |> 
#   count(gear, vs, am) |> 
#   group_by(gear, vs) |> 
#   mutate(perc = n/sum(n)*100)

5.3 modelsummary::datasummary()

modelsummary::datasummary_crosstab(species ~ sex, data = penguins)

tinytable_u4x6x0irk2f6hksmwrr6

species		female	male	All
Adelie	N	73	73	152
	% row	48.0	48.0	100.0
Chinstrap	N	34	34	68
	% row	50.0	50.0	100.0
Gentoo	N	58	61	124
	% row	46.8	49.2	100.0
All	N	165	168	344
	% row	48.0	48.8	100.0

6 関数

https://dplyr.tidyverse.org/articles/programming.html
{{}}(curly curly; doubled braces)について
- https://www.tidyverse.org/blog/2019/06/rlang-0-4-0/#a-simpler-interpolation-pattern-with-
- https://stackoverflow.com/questions/44121728/programming-with-dplyr-using-string-as-input
{{}}とenquoの使いわけ
- https://stackoverflow.com/questions/65627873/turn-a-dplyr-double-curly-braces-interpolation-into-a-string

6.1 基本

test <- function(x){
  x + 1
  }

test(1)

## [1] 2

test_xy <- function(x,y){
  x + y
}

test_xy(1, 2)

## [1] 3

式が1行だけなら{}不要

test_simple <- function(x) x + 1

test_simple(10)

## [1] 11

6.1.1 省略形

https://www.r-bloggers.com/2021/05/new-features-in-r-4-1-0/

test_s <- \(x){
  x + 1
  }

test_s(100)

## [1] 101

test_xy_s <- \(x,y){
  x + y
}

test_xy_s(2,3)

## [1] 5

6.1.2 関数内で数値または文字

test_num <- function(x){
  print(x)
} 

test_num(1)

## [1] 1

test_chr <- function(x){
  x2 <- deparse(substitute(x))
    print(x2)
} 

test_chr(1)

## [1] "1"

6.2 無名関数 anonymous function

https://adv-r.hadley.nz/functions.html#exercises-14
functionの定義部分を( )で囲むと実行できる

(function(x) x + 1)(1)

## [1] 2

6.2.1 無名関数の省略形（R4.1以降）

(\(x) x + 1)(1)

## [1] 2

(\(x, y) x + y)(2,3)

## [1] 5

6.2.1.1 `\`（バックスラッシュ）の表示について

ブラウザでHTML開いたときに円マーク￥（本当は半角）になる場合は，ブラウザのフォント設定で固定幅フォントを英語のものに変えると解決
具体的には，Chromeで三点リーダ > 設定 > デザイン > フォントをカスタマイズ > 固定幅フォントがデフォルトでMS Gothicになっている場合，ArialとかCourier Newに変える

6.3 関数の中身確認

参考
関数を()をつけずに入力して実行

select

## function (.data, ...) 
## {
##     UseMethod("select")
## }
## <bytecode: 0x0000021e49b55b70>
## <environment: namespace:dplyr>

methods()で調べるといくつか候補が出る

methods(select)

## [1] select.data.frame* select.list       
## see '?methods' for accessing help and source code

元のパッケージに:::(注：コロン3つ)をつけてmethodsで調べた対象を入力して実行するとコードが確認できる

dplyr:::select.data.frame

## function (.data, ...) 
## {
##     error_call <- dplyr_error_call()
##     loc <- tidyselect::eval_select(expr(c(...)), data = .data, 
##         error_call = error_call)
##     loc <- ensure_group_vars(loc, .data, notify = TRUE)
##     out <- dplyr_col_select(.data, loc)
##     out <- set_names(out, names(loc))
##     out
## }
## <bytecode: 0x0000021e529122b0>
## <environment: namespace:dplyr>

または関数を使うとパッケージ名なしでもできる

getAnywhere(select.data.frame)

## A single object matching 'select.data.frame' was found
## It was found in the following places
##   registered S3 method for select from namespace dplyr
##   namespace:dplyr
## with value
## 
## function (.data, ...) 
## {
##     error_call <- dplyr_error_call()
##     loc <- tidyselect::eval_select(expr(c(...)), data = .data, 
##         error_call = error_call)
##     loc <- ensure_group_vars(loc, .data, notify = TRUE)
##     out <- dplyr_col_select(.data, loc)
##     out <- set_names(out, names(loc))
##     out
## }
## <bytecode: 0x0000021e529122b0>
## <environment: namespace:dplyr>

6.4 平均値算出

6.4.1 平均値とn

mean_n <- 
  function(data, variable){
    data |> 
      summarise(across({{variable}},
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
                )
  }

mean_n(penguins, where(is.numeric))

## # A tibble: 1 × 10
##   bill_length_mm_mean bill_length_mm_n bill_depth_mm_mean bill_depth_mm_n
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342
## # ℹ 6 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_n <int>,
## #   body_mass_g_mean <dbl>, body_mass_g_n <int>, year_mean <dbl>, year_n <int>

6.4.1.1

mean_ns <- 
  \(data, variable){
    data |> 
      summarise(across({{variable}},
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
                )
  }

mean_ns(penguins, where(is.numeric))

## # A tibble: 1 × 10
##   bill_length_mm_mean bill_length_mm_n bill_depth_mm_mean bill_depth_mm_n
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342
## # ℹ 6 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_n <int>,
## #   body_mass_g_mean <dbl>, body_mass_g_n <int>, year_mean <dbl>, year_n <int>

6.4.1.2 確認

penguins |> 
      summarise(across(where(is.numeric),
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
                )

## # A tibble: 1 × 10
##   bill_length_mm_mean bill_length_mm_n bill_depth_mm_mean bill_depth_mm_n
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342
## # ℹ 6 more variables: flipper_length_mm_mean <dbl>, flipper_length_mm_n <int>,
## #   body_mass_g_mean <dbl>, body_mass_g_n <int>, year_mean <dbl>, year_n <int>

6.4.2 平均値とnで列をmeanとnに

cmean_n <- 
  function(data, variable){
    data |> 
      summarise(across({{variable}},
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
      ) |> 
      pivot_longer(everything(),
                   names_to = c("variables", ".value"), # ".value"の部分を列名に
                   names_pattern = "(.*)_(.*)")
  }

cmean_n(penguins, where(is.numeric))

## # A tibble: 5 × 3
##   variables           mean     n
##   <chr>              <dbl> <int>
## 1 bill_length_mm      43.9   342
## 2 bill_depth_mm       17.2   342
## 3 flipper_length_mm  201.    342
## 4 body_mass_g       4202.    342
## 5 year              2008.    344

6.4.2.1 確認

penguins |> 
      summarise(across(where(is.numeric),
                       list(mean = \(x) mean(x, na.rm = TRUE),
                            n    = \(x) sum(!is.na(x))))
      ) |> 
      pivot_longer(everything(),
                   names_to = c("variables", ".value"), # ".value"の部分を列名に
                   names_pattern = "(.*)_(.*)")

## # A tibble: 5 × 3
##   variables           mean     n
##   <chr>              <dbl> <int>
## 1 bill_length_mm      43.9   342
## 2 bill_depth_mm       17.2   342
## 3 flipper_length_mm  201.    342
## 4 body_mass_g       4202.    342
## 5 year              2008.    344

6.4.3 totalとgroupの平均値とnを一気に算出

https://community.rstudio.com/t/using-map-with-a-vector-of-variables-and-dplyr-programming/93088
pickを使う理由
- across()はその後に関数を適用する仕様だから，そうじゃない場合のよりよい選択肢として
- https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-pick-reframe-arrange/

all_group = function(data, outcome_vars, group_vars = NULL) {
  data |> 
    group_by(pick({{group_vars}})) |> 
    summarise(across({{outcome_vars}}, 
                     list(mean = \(x) mean(x, na.rm = TRUE),
                          n    = \(x) sum(!is.na(x)))))
}

all_group(penguins, bill_length_mm)

## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342

all_group(penguins, bill_length_mm, species)

## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123

# dplyr 1.1.0より前
#    group_by(across({{group_vars}})) |>

6.4.3.1 確認

penguins |> 
  summarise(across(bill_length_mm,
                   list(mean = \(x) mean(x, na.rm = TRUE),
                          n    = \(x) sum(!is.na(x)))))

## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342

penguins |> 
  group_by(species) |> 
  summarise(across(bill_length_mm,
                   list(mean = \(x) mean(x, na.rm = TRUE),
                          n    = \(x) sum(!is.na(x)))))

## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123

6.4.3.2 mapでリストに結果を格納

quos(NULL, species) |> 
  map( \(x) all_group(penguins, bill_length_mm, !!x))

## [[1]]
## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342
## 
## [[2]]
## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123

quos(NULL, species, island, c(species, island)) |> 
  map( \(x) all_group(penguins, bill_length_mm, !!x))

## [[1]]
## # A tibble: 1 × 2
##   bill_length_mm_mean bill_length_mm_n
##                 <dbl>            <int>
## 1                43.9              342
## 
## [[2]]
## # A tibble: 3 × 3
##   species   bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Adelie                   38.8              151
## 2 Chinstrap                48.8               68
## 3 Gentoo                   47.5              123
## 
## [[3]]
## # A tibble: 3 × 3
##   island    bill_length_mm_mean bill_length_mm_n
##   <fct>                   <dbl>            <int>
## 1 Biscoe                   45.3              167
## 2 Dream                    44.2              124
## 3 Torgersen                39.0               51
## 
## [[4]]
## # A tibble: 5 × 4
## # Groups:   species [3]
##   species   island    bill_length_mm_mean bill_length_mm_n
##   <fct>     <fct>                   <dbl>            <int>
## 1 Adelie    Biscoe                   39.0               44
## 2 Adelie    Dream                    38.5               56
## 3 Adelie    Torgersen                39.0               51
## 4 Chinstrap Dream                    48.8               68
## 5 Gentoo    Biscoe                   47.5              123

6.4.3.3 1つのデータフレームに

quos(NULL, species) |> 
  map( \(x) all_group(penguins, bill_length_mm, !!x)) |> 
  bind_rows()

## # A tibble: 4 × 3
##   bill_length_mm_mean bill_length_mm_n species  
##                 <dbl>            <int> <fct>    
## 1                43.9              342 <NA>     
## 2                38.8              151 Adelie   
## 3                48.8               68 Chinstrap
## 4                47.5              123 Gentoo

# こちらでもOK
# quos(NULL, species) |> 
#   map_dfr( \(x) all_group(penguins, bill_length_mm, !!x))

quos(NULL, species, island, c(species, island)) |> 
  map(\(x) all_group(penguins, bill_length_mm, !!x)) |> 
  bind_rows()

## # A tibble: 12 × 4
##    bill_length_mm_mean bill_length_mm_n species   island   
##                  <dbl>            <int> <fct>     <fct>    
##  1                43.9              342 <NA>      <NA>     
##  2                38.8              151 Adelie    <NA>     
##  3                48.8               68 Chinstrap <NA>     
##  4                47.5              123 Gentoo    <NA>     
##  5                45.3              167 <NA>      Biscoe   
##  6                44.2              124 <NA>      Dream    
##  7                39.0               51 <NA>      Torgersen
##  8                39.0               44 Adelie    Biscoe   
##  9                38.5               56 Adelie    Dream    
## 10                39.0               51 Adelie    Torgersen
## 11                48.8               68 Chinstrap Dream    
## 12                47.5              123 Gentoo    Biscoe

# こちらでもOK
# quos(NULL, species, island, c(species, island)) |> 
#   map_dfr( \(x) all_group(penguins, bill_length_mm, !!x))

6.5 クロス表

6.5.1 複数層別と単一のアウトカム

nest_tabyl = function(df, outcome_var, nest_vars = NULL){
df |>
  nest(data = !c({{nest_vars}})) |>
    mutate(freq =
           map(data, \(x) tabyl(x, {{outcome_var}}))) |> # show_na = FALSEで欠損無しのpercentのみ算出
  select(!data) |>
  unnest(cols = c(freq)) |>
  as_tibble()
}

# island x species
nest_tabyl(penguins, species, island)

## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0

# speciesのみ
nest_tabyl(penguins, species)

## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360

# 動かない
# nest_tabyl(penguins, c(species, island))

6.5.1.1 複数層別

# 層：全体，island
quos(NULL, island) |> 
  map( \(x) nest_tabyl(penguins, species, !!x))

## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0

# 層：全体，island, sex, island x sex
quos(NULL, island, sex, c(island, sex)) |> 
  map( \(x) nest_tabyl(penguins, species, !!x))

## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0    
## 
## [[3]]
## # A tibble: 9 × 4
##   sex    species       n percent
##   <fct>  <fct>     <int>   <dbl>
## 1 male   Adelie       73   0.435
## 2 male   Chinstrap    34   0.202
## 3 male   Gentoo       61   0.363
## 4 female Adelie       73   0.442
## 5 female Chinstrap    34   0.206
## 6 female Gentoo       58   0.352
## 7 <NA>   Adelie        6   0.545
## 8 <NA>   Chinstrap     0   0    
## 9 <NA>   Gentoo        5   0.455
## 
## [[4]]
## # A tibble: 27 × 5
##    island    sex    species       n percent
##    <fct>     <fct>  <fct>     <int>   <dbl>
##  1 Torgersen male   Adelie       23   1    
##  2 Torgersen male   Chinstrap     0   0    
##  3 Torgersen male   Gentoo        0   0    
##  4 Torgersen female Adelie       24   1    
##  5 Torgersen female Chinstrap     0   0    
##  6 Torgersen female Gentoo        0   0    
##  7 Torgersen <NA>   Adelie        5   1    
##  8 Torgersen <NA>   Chinstrap     0   0    
##  9 Torgersen <NA>   Gentoo        0   0    
## 10 Biscoe    female Adelie       22   0.275
## # ℹ 17 more rows

6.5.1.2 確認

penguins |>
  nest(data = !c(island)) |>
    mutate(freq =
           map(data, \(x) tabyl(x, species))) |>
  select(!data) |>
  unnest(cols = c(freq)) |>
  as_tibble()

## # A tibble: 9 × 4
##   island    species       n percent
##   <fct>     <fct>     <int>   <dbl>
## 1 Torgersen Adelie       52   1    
## 2 Torgersen Chinstrap     0   0    
## 3 Torgersen Gentoo        0   0    
## 4 Biscoe    Adelie       44   0.262
## 5 Biscoe    Chinstrap     0   0    
## 6 Biscoe    Gentoo      124   0.738
## 7 Dream     Adelie       56   0.452
## 8 Dream     Chinstrap    68   0.548
## 9 Dream     Gentoo        0   0

6.5.2 固定層別と複数のアウトカム

# アウトカム：species, island；層：なし
quos(species, island) |> 
  map( \(x) nest_tabyl(penguins, !!x))

## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 3 × 3
##   island        n percent
##   <fct>     <int>   <dbl>
## 1 Biscoe      168   0.488
## 2 Dream       124   0.360
## 3 Torgersen    52   0.151

# 文字でもOK
vars <- c("species", "island")

vars |> 
  map( \(x) nest_tabyl(penguins, x)) # !!.xでも動いた

## [[1]]
## # A tibble: 3 × 3
##   species       n percent
##   <fct>     <int>   <dbl>
## 1 Adelie      152   0.442
## 2 Chinstrap    68   0.198
## 3 Gentoo      124   0.360
## 
## [[2]]
## # A tibble: 3 × 3
##   island        n percent
##   <fct>     <int>   <dbl>
## 1 Biscoe      168   0.488
## 2 Dream       124   0.360
## 3 Torgersen    52   0.151

# アウトカム：species, island；層：sex
quos(species, island) |> 
  map( \(x) nest_tabyl(penguins, !!x, sex))

## [[1]]
## # A tibble: 9 × 4
##   sex    species       n percent
##   <fct>  <fct>     <int>   <dbl>
## 1 male   Adelie       73   0.435
## 2 male   Chinstrap    34   0.202
## 3 male   Gentoo       61   0.363
## 4 female Adelie       73   0.442
## 5 female Chinstrap    34   0.206
## 6 female Gentoo       58   0.352
## 7 <NA>   Adelie        6   0.545
## 8 <NA>   Chinstrap     0   0    
## 9 <NA>   Gentoo        5   0.455
## 
## [[2]]
## # A tibble: 9 × 4
##   sex    island        n percent
##   <fct>  <fct>     <int>   <dbl>
## 1 male   Biscoe       83  0.494 
## 2 male   Dream        62  0.369 
## 3 male   Torgersen    23  0.137 
## 4 female Biscoe       80  0.485 
## 5 female Dream        61  0.370 
## 6 female Torgersen    24  0.145 
## 7 <NA>   Biscoe        5  0.455 
## 8 <NA>   Dream         1  0.0909
## 9 <NA>   Torgersen     5  0.455

6.5.2.1 同じ文字を含む列名を1つにまとめる

それぞれのtabyl結果はアウトカム変数名の列名が異なる
単純にbind_rowsすると変数名列が横に増えていく
それを回避するため，tabyl結果が入ったlistに，それぞれ同じ名前にrenamaeしていく

library(psychTools) # bfiの格納パッケージ

ac <- 
  bfi |>
  select(A1,A2,C1,C2) |> 
  names()  

# 上で定義した変数についてnest_tabyl実行
ac_res <- 
map(all_of(ac), \(x) nest_tabyl(bfi, x, gender)) |> 
  set_names(ac) # 各要素に項目名の名前を設定


# 各リスト要素の上から3つを表示
map(ac_res, \(x) head(x, 3))

## $A1
## # A tibble: 3 × 5
##   gender    A1     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1   202   0.220         0.220
## 2      1     2   284   0.309         0.309
## 3      1     3   160   0.174         0.174
## 
## $A2
## # A tibble: 3 × 5
##   gender    A2     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1    26  0.0283        0.0286
## 2      1     2    63  0.0686        0.0694
## 3      1     3    67  0.0729        0.0738
## 
## $C1
## # A tibble: 3 × 5
##   gender    C1     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1    23  0.0250        0.0252
## 2      1     2    53  0.0577        0.0581
## 3      1     3   103  0.112         0.113 
## 
## $C2
## # A tibble: 3 × 5
##   gender    C2     n percent valid_percent
##    <int> <int> <int>   <dbl>         <dbl>
## 1      1     1    31  0.0337        0.0340
## 2      1     2    95  0.103         0.104 
## 3      1     3   112  0.122         0.123

# 各変数名を統一した名前にrename
ac_res <- 
ac_res |> 
  map(\(x) rename_with(x,
                   \(x) "response",               # 統一名をここに
                   starts_with(c("A", "C")))
      )

# 各リスト要素の上から3つを表示
map(ac_res, \(x) head(x, 3))

## $A1
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1   202   0.220         0.220
## 2      1        2   284   0.309         0.309
## 3      1        3   160   0.174         0.174
## 
## $A2
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1    26  0.0283        0.0286
## 2      1        2    63  0.0686        0.0694
## 3      1        3    67  0.0729        0.0738
## 
## $C1
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1    23  0.0250        0.0252
## 2      1        2    53  0.0577        0.0581
## 3      1        3   103  0.112         0.113 
## 
## $C2
## # A tibble: 3 × 5
##   gender response     n percent valid_percent
##    <int>    <int> <int>   <dbl>         <dbl>
## 1      1        1    31  0.0337        0.0340
## 2      1        2    95  0.103         0.104 
## 3      1        3   112  0.122         0.123

# 縦に連結
ac_res |> 
  bind_rows(.id = "items")

## # A tibble: 56 × 6
##    items gender response     n percent valid_percent
##    <chr>  <int>    <int> <int>   <dbl>         <dbl>
##  1 A1         1        1   202 0.220          0.220 
##  2 A1         1        2   284 0.309          0.309 
##  3 A1         1        3   160 0.174          0.174 
##  4 A1         1        4   139 0.151          0.151 
##  5 A1         1        5    99 0.108          0.108 
##  6 A1         1        6    34 0.0370         0.0370
##  7 A1         1       NA     1 0.00109       NA     
##  8 A1         2        1   720 0.383          0.386 
##  9 A1         2        2   534 0.284          0.286 
## 10 A1         2        3   242 0.129          0.130 
## # ℹ 46 more rows

6.5.2.2 複数層をみるためmtcarsで

# アウトカム：cyl, gear；層：vs
quos(cyl, gear) |> 
  map( \(x) nest_tabyl(mtcars, !!x, vs))

## [[1]]
## # A tibble: 5 × 4
##      vs   cyl     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     0     4     1  0.0556
## 2     0     6     3  0.167 
## 3     0     8    14  0.778 
## 4     1     4    10  0.714 
## 5     1     6     4  0.286 
## 
## [[2]]
## # A tibble: 6 × 4
##      vs  gear     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     0     3    12  0.667 
## 2     0     4     2  0.111 
## 3     0     5     4  0.222 
## 4     1     3     3  0.214 
## 5     1     4    10  0.714 
## 6     1     5     1  0.0714

# アウトカム：cyl, gear；層：vs x am
quos(cyl, gear) |> 
  map( \(x) nest_tabyl(mtcars, !!x, c(vs,am)))

## [[1]]
## # A tibble: 7 × 5
##      vs    am   cyl     n percent
##   <dbl> <dbl> <dbl> <int>   <dbl>
## 1     0     1     4     1   0.167
## 2     0     1     6     3   0.5  
## 3     0     1     8     2   0.333
## 4     1     1     4     7   1    
## 5     1     0     4     3   0.429
## 6     1     0     6     4   0.571
## 7     0     0     8    12   1    
## 
## [[2]]
## # A tibble: 7 × 5
##      vs    am  gear     n percent
##   <dbl> <dbl> <dbl> <int>   <dbl>
## 1     0     1     4     2   0.333
## 2     0     1     5     4   0.667
## 3     1     1     4     6   0.857
## 4     1     1     5     1   0.143
## 5     1     0     3     3   0.429
## 6     1     0     4     4   0.571
## 7     0     0     3    12   1

6.5.3 複数層別と複数アウトカム

groups <- quos(NULL, NULL, vs, am)
outcomes <- quos(cyl, gear, cyl, gear) # 層別と数をそろえるため対応する対象を繰り返す
  
  
map2(outcomes, groups,  \(x, y) nest_tabyl(mtcars, !!x, !!y))

## [[1]]
## # A tibble: 3 × 3
##     cyl     n percent
##   <dbl> <int>   <dbl>
## 1     4    11   0.344
## 2     6     7   0.219
## 3     8    14   0.438
## 
## [[2]]
## # A tibble: 3 × 3
##    gear     n percent
##   <dbl> <int>   <dbl>
## 1     3    15   0.469
## 2     4    12   0.375
## 3     5     5   0.156
## 
## [[3]]
## # A tibble: 5 × 4
##      vs   cyl     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     0     4     1  0.0556
## 2     0     6     3  0.167 
## 3     0     8    14  0.778 
## 4     1     4    10  0.714 
## 5     1     6     4  0.286 
## 
## [[4]]
## # A tibble: 4 × 4
##      am  gear     n percent
##   <dbl> <dbl> <int>   <dbl>
## 1     1     4     8   0.615
## 2     1     5     5   0.385
## 3     0     3    15   0.789
## 4     0     4     4   0.211

6.5.3.1 確認

tabyl(mtcars, vs, cyl) |> 
  adorn_percentages()

##  vs          4         6         8
##   0 0.05555556 0.1666667 0.7777778
##   1 0.71428571 0.2857143 0.0000000

6.5.4 欠損がある場合

7 rlangについて

7.1 curly-curly {{}}

引数の文字そのままだけだとエラーになるので特別な指定が必要
{{}}演算子を使えば関数の中で，データフレームの変数名を入れられる
参照
- https://www.tidyverse.org/blog/2020/02/glue-strings-and-tidy-eval/
- https://rlang.r-lib.org/reference/quasiquotation.html

7.1.1 rlang 0.4.0以降から

https://www.tidyverse.org/blog/2019/06/rlang-0-4-0/#a-simpler-interpolation-pattern-with-

mean_by <- function(df, by, var){
df |> 
  group_by({{by}}) |> 
  summarise(mean = mean({{var}}, na.rm =  TRUE))
}

mean_by(penguins, species, bill_length_mm)

## # A tibble: 3 × 2
##   species    mean
##   <fct>     <dbl>
## 1 Adelie     38.8
## 2 Chinstrap  48.8
## 3 Gentoo     47.5

7.1.1.1 {{}}は!!enquo()のショートカット

!!(bang-bang)
- 単一の環境変数（<-で作られた変数）を代入する場合に使う
enquo
- defusing演算子
  - https://rlang.r-lib.org/reference/nse-defuse.html
- base関数でいうsubstitute()
- 古い情報: https://tidyeval.tidyverse.org/sec-up-to-speed.html

mean_by2 <- function(df, by, var){
df |> 
  group_by(!!enquo(by)) |> 
  summarise(mean = mean(!!enquo(var), na.rm =  TRUE))
}

mean_by2(penguins, species, bill_length_mm)

## # A tibble: 3 × 2
##   species    mean
##   <fct>     <dbl>
## 1 Adelie     38.8
## 2 Chinstrap  48.8
## 3 Gentoo     47.5

7.1.2 作成する変数名にglueを利用する

:=はセイウチ演算子（walrus operator）というらしい

mean_by3 <- function(df, by, var, prefix = "mean"){
  df |>
    group_by({{ by }}) |>
    summarise("{prefix}_{{ var }}" := mean({{ var }}, na.rm = TRUE))
}

mean_by3(penguins, species, bill_length_mm, prefix = "mean")

## # A tibble: 3 × 2
##   species   mean_bill_length_mm
##   <fct>                   <dbl>
## 1 Adelie                   38.8
## 2 Chinstrap                48.8
## 3 Gentoo                   47.5

8 map関連

8.1 データフレーム内の複数の変数に一度に関数を適用

# 使用する変数格納
vars <-
  mtcars |> 
  select(vs,am,gear) |> 
  names()

map(vars, \(x) count(mtcars, .data[[x]])) |> 
  set_names(vars)                             # listの要素名付与

## $vs
##   vs  n
## 1  0 18
## 2  1 14
## 
## $am
##   am  n
## 1  0 19
## 2  1 13
## 
## $gear
##   gear  n
## 1    3 15
## 2    4 12
## 3    5  5

# 割合も
map(vars, \(x) count(mtcars, .data[[x]])) |> 
  set_names(vars) |> 
map(\(x) mutate(x, percent = n/sum(n)))

## $vs
##   vs  n percent
## 1  0 18  0.5625
## 2  1 14  0.4375
## 
## $am
##   am  n percent
## 1  0 19 0.59375
## 2  1 13 0.40625
## 
## $gear
##   gear  n percent
## 1    3 15 0.46875
## 2    4 12 0.37500
## 3    5  5 0.15625

8.2 リストの各要素のデータフレームで特定の変数をrename

# renameしたい変数の型を因子型に
mtcars_f <- 
  mtcars |> 
  mutate(across(all_of(vars),
                factor))

tables <- 
map(vars, \(x) count(mtcars_f, .data[[x]], .drop = FALSE)) |> 
  set_names(vars) |> 
map(\(x) mutate(x, percent = n/sum(n)))

# 各変数に対し関数を適用(tidyselect 1.2.0 >= でwarningが出るように)
# tables <- 
# map(vars, \(x) tabyl(mtcars_f, .data[[x]])) |> 
#   set_names(vars)

# 各変数のtabyl結果で連番作成
tables <- 
  map(tables, \(x) mutate(x, vid = row_number()))

# 各リスト要素の変数名の列名に文字を追加
tables <- 
tables |> 
    map(\(x) rename_with(x, 
                     \(y) str_c(y, "_test"),
                     starts_with("per")))

# purrr 1.0.0より前
# tables <- 
#   tables |> 
#   map(~rename_with(., 
#                       ~ str_c(., "_test"),
#                       starts_with("per")))

# 各リスト要素の変数名の列を同じ名前に
tables <- 
  tables |> 
  map(\(x) rename_with(x, 
                       ~ "levels",
                       where(is.factor)))

# purrr 1.0.0より前
# tables <- 
#   tables |> 
#   map(~rename_with(., 
#                       ~ "levels",
#                       where(is.factor)))

bind_rows(tables, .id = "var_name")

##   var_name levels  n percent_test vid
## 1       vs      0 18      0.56250   1
## 2       vs      1 14      0.43750   2
## 3       am      0 19      0.59375   1
## 4       am      1 13      0.40625   2
## 5     gear      3 15      0.46875   1
## 6     gear      4 12      0.37500   2
## 7     gear      5  5      0.15625   3

8.3 imap & rename_with

https://stackoverflow.com/questions/61914847/replacing-dplyrfuns-within-imap-that-refers-to-the-y-value

df_2020 <- tribble(
~id, ~var1, ~var2,
1, 1, 0,
2, 0, 1,
3, 1, 1
)

df_2021 <- tribble(
~id, ~var1, ~var2,
1, 0, 0,
2, 1, 0,
3, 1, 1
)

# データフレームをリストにまとめて要素名をつける
df_list <- 
  list(df_2020, df_2021) |> 
  set_names("y20", "y21")

# リスト内のデータフレームのid以外の変数に要素名の接尾辞をつける
df_list <- 
 imap(df_list,
       function(a,b){
         rename_with(a,
          \(x) {str_c(x, b, sep = "_")}, !id)}
      )

df_list

## $y20
## # A tibble: 3 × 3
##      id var1_y20 var2_y20
##   <dbl>    <dbl>    <dbl>
## 1     1        1        0
## 2     2        0        1
## 3     3        1        1
## 
## $y21
## # A tibble: 3 × 3
##      id var1_y21 var2_y21
##   <dbl>    <dbl>    <dbl>
## 1     1        0        0
## 2     2        1        0
## 3     3        1        1

# リスト内のデータフレームを横に連結

df_list |>
  reduce(full_join, by = "id")

## # A tibble: 3 × 5
##      id var1_y20 var2_y20 var1_y21 var2_y21
##   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1     1        1        0        0        0
## 2     2        0        1        1        0
## 3     3        1        1        1        1

8.4 回帰：複数のアウトカムに同じ説明変数

# アウトカム変数名格納
outcomes <- 
  penguins |> 
  select(ends_with("_mm")) |> 
  names()
  
# 回帰
res_l <- 
  str_c(outcomes, "~ species*sex") |>
  map(\(x) lm(as.formula(x), data = penguins)) |> 
  set_names(outcomes)

# 結果の要約

  res_l |> 
  map(\(x) broom::tidy(x, conf.int = TRUE)) |> 
  bind_rows(.id = "outcome") |> 
    print(n = 18)

## # A tibble: 18 × 8
##    outcome       term  estimate std.error statistic   p.value conf.low conf.high
##    <chr>         <chr>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 bill_length_… (Int…  37.3       0.271    137.    2.30e-291  36.7       37.8  
##  2 bill_length_… spec…   9.32      0.481     19.4   3.06e- 56   8.37      10.3  
##  3 bill_length_… spec…   8.31      0.407     20.4   3.18e- 60   7.50       9.11 
##  4 bill_length_… sexm…   3.13      0.383      8.17  6.64e- 15   2.38       3.89 
##  5 bill_length_… spec…   1.39      0.680      2.04  4.21e-  2   0.0501     2.73 
##  6 bill_length_… spec…   0.777     0.572      1.36  1.75e-  1  -0.348      1.90 
##  7 bill_depth_mm (Int…  17.6       0.0976   180.    0          17.4       17.8  
##  8 bill_depth_mm spec…  -0.0337    0.173     -0.194 8.46e-  1  -0.374      0.307
##  9 bill_depth_mm spec…  -3.38      0.147    -23.1   1.55e- 70  -3.67      -3.10 
## 10 bill_depth_mm sexm…   1.45      0.138     10.5   1.96e- 22   1.18       1.72 
## 11 bill_depth_mm spec…   0.214     0.245      0.874 3.83e-  1  -0.268      0.696
## 12 bill_depth_mm spec…   0.0294    0.206      0.143 8.87e-  1  -0.376      0.435
## 13 flipper_leng… (Int… 188.        0.662    284.    0         186.       189.   
## 14 flipper_leng… spec…   3.94      1.17       3.36  8.84e-  4   1.63       6.25 
## 15 flipper_leng… spec…  24.9       0.995     25.0   4.93e- 78  23.0       26.9  
## 16 flipper_leng… sexm…   4.62      0.936      4.93  1.30e-  6   2.77       6.46 
## 17 flipper_leng… spec…   3.56      1.66       2.14  3.28e-  2   0.293      6.83 
## 18 flipper_leng… spec…   4.22      1.40       3.02  2.74e-  3   1.47       6.97

9 要約いろいろ

9.1 psych::describe

数値の要約には便利

psych::describe(penguins)

##                   vars   n    mean     sd  median trimmed    mad    min    max
## species*             1 344    1.92   0.89    2.00    1.90   1.48    1.0    3.0
## island*              2 344    1.66   0.73    2.00    1.58   1.48    1.0    3.0
## bill_length_mm       3 342   43.92   5.46   44.45   43.91   7.04   32.1   59.6
## bill_depth_mm        4 342   17.15   1.97   17.30   17.17   2.22   13.1   21.5
## flipper_length_mm    5 342  200.92  14.06  197.00  200.34  16.31  172.0  231.0
## body_mass_g          6 342 4201.75 801.95 4050.00 4154.01 889.56 2700.0 6300.0
## sex*                 7 333    1.50   0.50    2.00    1.51   0.00    1.0    2.0
## year                 8 344 2008.03   0.82 2008.00 2008.04   1.48 2007.0 2009.0
##                    range  skew kurtosis    se
## species*             2.0  0.16    -1.73  0.05
## island*              2.0  0.61    -0.91  0.04
## bill_length_mm      27.5  0.05    -0.89  0.30
## bill_depth_mm        8.4 -0.14    -0.92  0.11
## flipper_length_mm   59.0  0.34    -1.00  0.76
## body_mass_g       3600.0  0.47    -0.74 43.36
## sex*                 1.0 -0.02    -2.01  0.03
## year                 2.0 -0.05    -1.51  0.04

9.2 skimr

9.2.1 ヒストグラムなし

library(skimr)

skim_without_charts(penguins) |> 
  as_tibble()

## # A tibble: 8 × 14
##   skim_type skim_variable n_missing complete_rate factor.ordered factor.n_unique
##   <chr>     <chr>             <int>         <dbl> <lgl>                    <int>
## 1 factor    species               0         1     FALSE                        3
## 2 factor    island                0         1     FALSE                        3
## 3 factor    sex                  11         0.968 FALSE                        2
## 4 numeric   bill_length_…         2         0.994 NA                          NA
## 5 numeric   bill_depth_mm         2         0.994 NA                          NA
## 6 numeric   flipper_leng…         2         0.994 NA                          NA
## 7 numeric   body_mass_g           2         0.994 NA                          NA
## 8 numeric   year                  0         1     NA                          NA
## # ℹ 8 more variables: factor.top_counts <chr>, numeric.mean <dbl>,
## #   numeric.sd <dbl>, numeric.p0 <dbl>, numeric.p25 <dbl>, numeric.p50 <dbl>,
## #   numeric.p75 <dbl>, numeric.p100 <dbl>

9.2.2 型を限定

skim(penguins) |> 
yank("numeric") |> 
  as_tibble()

## # A tibble: 5 × 11
##   skim_variable     n_missing complete_rate   mean      sd     p0    p25    p50
##   <chr>                 <int>         <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 bill_length_mm            2         0.994   43.9   5.46    32.1   39.2   44.4
## 2 bill_depth_mm             2         0.994   17.2   1.97    13.1   15.6   17.3
## 3 flipper_length_mm         2         0.994  201.   14.1    172    190    197  
## 4 body_mass_g               2         0.994 4202.  802.    2700   3550   4050  
## 5 year                      0         1     2008.    0.818 2007   2007   2008  
## # ℹ 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>

9.2.3 nを表示

skimn <- skim_with(base = sfl(n = \(x) sum(!is.na(x))))

skimn(penguins)

Data summary
Name	penguins
Number of rows	344
Number of columns	8
_______________________
Column type frequency:
factor	3
numeric	5
________________________
Group variables	None

Variable type: factor

skim_variable	n	ordered	n_unique	top_counts
species	344	FALSE	3	Ade: 152, Gen: 124, Chi: 68
island	344	FALSE	3	Bis: 168, Dre: 124, Tor: 52
sex	333	FALSE	2	mal: 168, fem: 165

Variable type: numeric

skim_variable	n	mean	sd	p0	p25	p50	p75	p100	hist
bill_length_mm	342	43.92	5.46	32.1	39.23	44.45	48.5	59.6	▃▇▇▆▁
bill_depth_mm	342	17.15	1.97	13.1	15.60	17.30	18.7	21.5	▅▅▇▇▂
flipper_length_mm	342	200.92	14.06	172.0	190.00	197.00	213.0	231.0	▂▇▃▅▂
body_mass_g	342	4201.75	801.95	2700.0	3550.00	4050.00	4750.0	6300.0	▃▇▆▃▂
year	344	2008.03	0.82	2007.0	2007.00	2008.00	2009.0	2009.0	▇▁▇▁▇

9.2.4 層別

penguins |> 
  group_by(sex) |> 
  skim() |> 
  as_tibble()

## # A tibble: 21 × 16
##    skim_type skim_variable  sex    n_missing complete_rate factor.ordered
##    <chr>     <chr>          <fct>      <int>         <dbl> <lgl>         
##  1 factor    species        female         0         1     FALSE         
##  2 factor    species        male           0         1     FALSE         
##  3 factor    species        <NA>           0         1     FALSE         
##  4 factor    island         female         0         1     FALSE         
##  5 factor    island         male           0         1     FALSE         
##  6 factor    island         <NA>           0         1     FALSE         
##  7 numeric   bill_length_mm female         0         1     NA            
##  8 numeric   bill_length_mm male           0         1     NA            
##  9 numeric   bill_length_mm <NA>           2         0.818 NA            
## 10 numeric   bill_depth_mm  female         0         1     NA            
## # ℹ 11 more rows
## # ℹ 10 more variables: factor.n_unique <int>, factor.top_counts <chr>,
## #   numeric.mean <dbl>, numeric.sd <dbl>, numeric.p0 <dbl>, numeric.p25 <dbl>,
## #   numeric.p50 <dbl>, numeric.p75 <dbl>, numeric.p100 <dbl>,
## #   numeric.hist <chr>

9.2.4.1 層別に特定の変数の平均とSD

penguins |> 
  group_by(species, sex) |> 
  skim(bill_length_mm, bill_depth_mm) |> # selectを先にもってきてもよい
  as_tibble()

## # A tibble: 16 × 14
##    skim_type skim_variable  species   sex   n_missing complete_rate numeric.mean
##    <chr>     <chr>          <fct>     <fct>     <int>         <dbl>        <dbl>
##  1 numeric   bill_length_mm Adelie    fema…         0         1             37.3
##  2 numeric   bill_length_mm Adelie    male          0         1             40.4
##  3 numeric   bill_length_mm Adelie    <NA>          1         0.833         37.8
##  4 numeric   bill_length_mm Chinstrap fema…         0         1             46.6
##  5 numeric   bill_length_mm Chinstrap male          0         1             51.1
##  6 numeric   bill_length_mm Gentoo    fema…         0         1             45.6
##  7 numeric   bill_length_mm Gentoo    male          0         1             49.5
##  8 numeric   bill_length_mm Gentoo    <NA>          1         0.8           45.6
##  9 numeric   bill_depth_mm  Adelie    fema…         0         1             17.6
## 10 numeric   bill_depth_mm  Adelie    male          0         1             19.1
## 11 numeric   bill_depth_mm  Adelie    <NA>          1         0.833         18.3
## 12 numeric   bill_depth_mm  Chinstrap fema…         0         1             17.6
## 13 numeric   bill_depth_mm  Chinstrap male          0         1             19.3
## 14 numeric   bill_depth_mm  Gentoo    fema…         0         1             14.2
## 15 numeric   bill_depth_mm  Gentoo    male          0         1             15.7
## 16 numeric   bill_depth_mm  Gentoo    <NA>          1         0.8           14.6
## # ℹ 7 more variables: numeric.sd <dbl>, numeric.p0 <dbl>, numeric.p25 <dbl>,
## #   numeric.p50 <dbl>, numeric.p75 <dbl>, numeric.p100 <dbl>,
## #   numeric.hist <chr>

9.2.4.2 層別にnを表示

https://github.com/ropensci/skimr/issues/505

my_skim <- skim_with(base = sfl(n = length))

penguins |> 
  group_by(species, sex) |> 
  my_skim(bill_length_mm) |>
  as_tibble()

## # A tibble: 8 × 13
##   skim_type skim_variable species sex       n numeric.mean numeric.sd numeric.p0
##   <chr>     <chr>         <fct>   <fct> <int>        <dbl>      <dbl>      <dbl>
## 1 numeric   bill_length_… Adelie  fema…    73         37.3       2.03       32.1
## 2 numeric   bill_length_… Adelie  male     73         40.4       2.28       34.6
## 3 numeric   bill_length_… Adelie  <NA>      6         37.8       2.80       34.1
## 4 numeric   bill_length_… Chinst… fema…    34         46.6       3.11       40.9
## 5 numeric   bill_length_… Chinst… male     34         51.1       1.56       48.5
## 6 numeric   bill_length_… Gentoo  fema…    58         45.6       2.05       40.9
## 7 numeric   bill_length_… Gentoo  male     61         49.5       2.72       44.4
## 8 numeric   bill_length_… Gentoo  <NA>      5         45.6       1.37       44.5
## # ℹ 5 more variables: numeric.p25 <dbl>, numeric.p50 <dbl>, numeric.p75 <dbl>,
## #   numeric.p100 <dbl>, numeric.hist <chr>

9.2.5 longに

to_long(penguins)

## # A tibble: 65 × 4
##    skim_type skim_variable     stat          formatted
##    <chr>     <chr>             <chr>         <chr>    
##  1 factor    species           n_missing     0        
##  2 factor    island            n_missing     0        
##  3 factor    sex               n_missing     11       
##  4 numeric   bill_length_mm    n_missing     2        
##  5 numeric   bill_depth_mm     n_missing     2        
##  6 numeric   flipper_length_mm n_missing     2        
##  7 numeric   body_mass_g       n_missing     2        
##  8 numeric   year              n_missing     0        
##  9 factor    species           complete_rate 1        
## 10 factor    island            complete_rate 1        
## # ℹ 55 more rows

9.3 Data Explorer

https://rpubs.com/mark_sch7/DataExplorerPackage

# devtools::install_github("boxuancui/DataExplorer")
library(DataExplorer)

9.3.1 グラフ

9.3.1.1 頻度の棒グラフ

plot_bar(penguins)

9.3.1.2 ヒストグラム

plot_histogram(penguins)

10 数字

10.1 eの表記を数値に

浮動小数点表記法 floating point expression あるいは，科学的表記法 scientific notation
- http://aoki2.si.gunma-u.ac.jp/Hanasi/StatTalk/fudoushousuu.html

bnum <- 10000000000

bnum

## [1] 1e+10

format(bnum, scientific = FALSE)

## [1] "10000000000"

10.1.1 設定自体を変更

options(scipen=999)

bnum

## [1] 10000000000

# 戻す
options(scipen = 0)

bnum

## [1] 1e+10

10.2 小数点

10.2.1 四捨五入のルール

http://cse.naro.affrc.go.jp/takezawa/r-tips/r/37.html

round(1.5)

## [1] 2

round(2.5)

## [1] 2

10.2.2 切り捨てと切り上げ

round(0.123456, 5) # 普通の丸め

## [1] 0.12346

round(0.123454, 5) # 普通の丸め

## [1] 0.12345

# library(plyr) # dplyr読んだ後に読み込むとdplyrの関数で動かなくなるのがあるので読むならtidyverseの先に
plyr::round_any(0.123456, 0.00001, floor)   # 切り捨て

## [1] 0.12345

plyr::round_any(0.123454, 0.00001, ceiling) # 切り上げ

## [1] 0.12346

10.2.2.1 切り捨てをtruncで工夫（整数にして割り戻す）

trunc(0.123456*10^5)/10^5

## [1] 0.12345

10.2.2.2 mutateのなかで使う

# 小数第2位まで出ている変数
mtcars |> 
  select(drat, wt)

##                     drat    wt
## Mazda RX4           3.90 2.620
## Mazda RX4 Wag       3.90 2.875
## Datsun 710          3.85 2.320
## Hornet 4 Drive      3.08 3.215
## Hornet Sportabout   3.15 3.440
## Valiant             2.76 3.460
## Duster 360          3.21 3.570
## Merc 240D           3.69 3.190
## Merc 230            3.92 3.150
## Merc 280            3.92 3.440
## Merc 280C           3.92 3.440
## Merc 450SE          3.07 4.070
## Merc 450SL          3.07 3.730
## Merc 450SLC         3.07 3.780
## Cadillac Fleetwood  2.93 5.250
## Lincoln Continental 3.00 5.424
## Chrysler Imperial   3.23 5.345
## Fiat 128            4.08 2.200
## Honda Civic         4.93 1.615
## Toyota Corolla      4.22 1.835
## Toyota Corona       3.70 2.465
## Dodge Challenger    2.76 3.520
## AMC Javelin         3.15 3.435
## Camaro Z28          3.73 3.840
## Pontiac Firebird    3.08 3.845
## Fiat X1-9           4.08 1.935
## Porsche 914-2       4.43 2.140
## Lotus Europa        3.77 1.513
## Ford Pantera L      4.22 3.170
## Ferrari Dino        3.62 2.770
## Maserati Bora       3.54 3.570
## Volvo 142E          4.11 2.780

# 変数上書き

10.2.2.2.1 変数上書き

mtcars |> 
  mutate(across(c(drat,wt),
                \(x) plyr::round_any(x, 0.1, floor))) |> 
  select(drat, wt) |> 
  head()

##                   drat  wt
## Mazda RX4          3.9 2.6
## Mazda RX4 Wag      3.9 2.8
## Datsun 710         3.8 2.3
## Hornet 4 Drive     3.0 3.2
## Hornet Sportabout  3.1 3.4
## Valiant            2.7 3.4

10.2.2.2.2 変数追加

mtcars |> 
  select(drat, wt) |>
  mutate(across(c(drat,wt),
                list(fl = \(x) plyr::round_any(x, 0.1, floor)))) |> 
    head()

##                   drat    wt drat_fl wt_fl
## Mazda RX4         3.90 2.620     3.9   2.6
## Mazda RX4 Wag     3.90 2.875     3.9   2.8
## Datsun 710        3.85 2.320     3.8   2.3
## Hornet 4 Drive    3.08 3.215     3.0   3.2
## Hornet Sportabout 3.15 3.440     3.1   3.4
## Valiant           2.76 3.460     2.7   3.4

10.2.2.2.3 trunc

mtcars |> 
  mutate(across(c(drat,wt),
                \(x) trunc(x*10)/10)) |> 
  select(drat, wt) |> 
  head()

##                   drat  wt
## Mazda RX4          3.9 2.6
## Mazda RX4 Wag      3.9 2.8
## Datsun 710         3.8 2.3
## Hornet 4 Drive     3.0 3.2
## Hornet Sportabout  3.1 3.4
## Valiant            2.7 3.4

11 文字

11.1 str_replace

置換する文字対の一覧を名前付きベクトルで作成

moji_vec <- 
 as.character(1:3) |>                 # ベクトルの値: 変更先の文字
  set_names(c("one", "two", "three")) # 名前: 元の文字

moji_vec

##   one   two three 
##   "1"   "2"   "3"

x <- c("item_one", "item_two", "item_three", "item_two_three")
x

## [1] "item_one"       "item_two"       "item_three"     "item_two_three"

str_replace_all(x, 
            moji_vec)

## [1] "item_1"   "item_2"   "item_3"   "item_2_3"

11.2 str_squish テキスト中の複数空白を1つに

文字列の中に複数空白があっても1つだけにする

str_squish("あ　い　　う　　　　え　　　　　　　お")

## [1] "あ い う え お"

11.3 str_replace 特定の文字列以降を削除

word <- "このテキストより後を削除"

str_replace("文字列　このテキストより後を削除　　削除部分",
           str_c(word, ".*"), word)

## [1] "文字列　このテキストより後を削除"

11.4 変数名, 列名, column

11.4.1 出力いろいろ

11.4.1.1 そのまま文字ベクトルとして

vars <- 
  penguins |>
  names()

vars

## [1] "species"           "island"            "bill_length_mm"   
## [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
## [7] "sex"               "year"

11.4.1.2 文字ベクトルの入力時の形式として

dput(vars)

## c("species", "island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", 
## "body_mass_g", "sex", "year")

constructiveパッケージ

constructive::construct(vars)

## c(
##   "species", "island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm",
##   "body_mass_g", "sex", "year"
## )

11.4.1.3 1つの文字列として

cat(vars)

## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year

11.4.1.4 ” “で囲まないベクトルとして

noquote(vars)

## [1] species           island            bill_length_mm    bill_depth_mm    
## [5] flipper_length_mm body_mass_g       sex               year

https://cynkra.github.io/constructive/

12 list

12.1 長さの違う要素を持つlistを2列のデータフレームにまとめる

https://stackoverflow.com/questions/68855260/turn-a-list-with-elements-of-unequal-length-into-a-two-column-dataframe-in-r-dp
色々方法があるがutils::stack()が早い

test_l <- 
  list(a = c(1,2),
       b = c(1,2,3),
       c = c(1,2,3,4))

stack(test_l)

##   values ind
## 1      1   a
## 2      2   a
## 3      1   b
## 4      2   b
## 5      3   b
## 6      1   c
## 7      2   c
## 8      3   c
## 9      4   c

13 データフレーム

13.1 差分の確認

joinした後に.xと.yの接尾辞がついた変数の差分作成

13.1.1 joinする2つのデータフレーム作成

p1 <-
  penguins |>
    select(species, starts_with("bill")) |> 
    slice(1:3) |> 
    mutate(id = row_number())
  
p2 <-
  penguins |>
    select(species, starts_with("bill")) |> 
    slice(4:6) |> 
    mutate(id = row_number())

p1_2 <-
  p1 |>
    left_join(p2, by = join_by(id, species))

p1_2

## # A tibble: 3 × 6
##   species bill_length_mm.x bill_depth_mm.x    id bill_length_mm.y
##   <fct>              <dbl>           <dbl> <int>            <dbl>
## 1 Adelie              39.1            18.7     1             NA  
## 2 Adelie              39.5            17.4     2             36.7
## 3 Adelie              40.3            18       3             39.3
## # ℹ 1 more variable: bill_depth_mm.y <dbl>

13.1.2 関数を作成して実行

参考：https://github.com/tidyverse/dplyr/issues/4834

# 対象となる変数名のベクトル作成
var_name <- 
  penguins |>
    select(starts_with("bill")) |> 
    names()
  
  
# 差分を計算する関数作成

make_dif <- function(df, var){
  df |>
    mutate("dif_,{var}" :=
             !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y"))
    )
}

make_dif2 <- function(df, var){
  df |>
    mutate("dif_{var}" := 
             !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y"))
    )
}

# 動く
# make_dif <- function(df, var){
#     df |>
#       mutate(!!sym(str_c("dif_",{{var}})) :=
#                     !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y")))
#   }
# 
# make_dif2 <- function(df, var){
#     df |>
#       mutate("dif_{{var}}" := !!sym(str_c(var, ".x")) - !!sym(str_c(var, ".y")))
#   }

# 動く
# make_dif <- function(df, var){
#     df |>
#       mutate(!!sym(str_c("dif_",{{var}})) :=
#                     !!sym(str_c({{var}}, ".x")) - !!sym(str_c({{var}}, ".y")))
#   }
# 
# make_dif2 <- function(df, var){
#     df |>
#       mutate("dif_{{var}}" := !!sym(str_c({{var}}, ".x")) - !!sym(str_c({{var}}, ".y")))
#   }

# 動く
# make_di2 <- function(df, var){
# var1 <- sym(str_c({{var}}, ".x"))
# var2 <- sym(str_c({{var}}, ".y"))
#       df |>
#       mutate("dif_{{var}}" := !!var1 - !!var2)
#   }

# 動く
# make_di2 <- function(df, var){
# var1 <- sym("{{var}}.x")
# var2 <- sym("{{var}}.y")
#       df |>
#       mutate("dif_{{var}}" := !!var1 - !!var2)
#   }


# 動かない
# make_dif3 <- function(df, var){
# df |> 
#   mutate("dif_{{var}}" := !!sym("{{var}}.x") - !!sym("{{var}}.y"))
# }

# 動かない
# make_dif3 <- function(df, var){
#   form <- "{{var}}.x - {{var}}.y"
# df |>
#   mutate("dif_{{var}}" := !! rlang::parse_expr(form) )
# }


  
p1_2 |>
  reduce(var_name, make_dif, .init = _) |> 
  select(id, species, contains("bill_length"), contains("bill_depth")) |> 
    glimpse()

## Rows: 3
## Columns: 8
## $ id                    <int> 1, 2, 3
## $ species               <fct> Adelie, Adelie, Adelie
## $ bill_length_mm.x      <dbl> 39.1, 39.5, 40.3
## $ bill_length_mm.y      <dbl> NA, 36.7, 39.3
## $ `dif_,bill_length_mm` <dbl> NA, 2.8, 1.0
## $ bill_depth_mm.x       <dbl> 18.7, 17.4, 18.0
## $ bill_depth_mm.y       <dbl> NA, 19.3, 20.6
## $ `dif_,bill_depth_mm`  <dbl> NA, -1.9, -2.6

p1_2 |>
  reduce(var_name, make_dif2, .init = _) |> 
  select(id, species, contains("bill_length"), contains("bill_depth")) |> 
    glimpse()

## Rows: 3
## Columns: 8
## $ id                 <int> 1, 2, 3
## $ species            <fct> Adelie, Adelie, Adelie
## $ bill_length_mm.x   <dbl> 39.1, 39.5, 40.3
## $ bill_length_mm.y   <dbl> NA, 36.7, 39.3
## $ dif_bill_length_mm <dbl> NA, 2.8, 1.0
## $ bill_depth_mm.x    <dbl> 18.7, 17.4, 18.0
## $ bill_depth_mm.y    <dbl> NA, 19.3, 20.6
## $ dif_bill_depth_mm  <dbl> NA, -1.9, -2.6

# p1_2 |>
#   reduce(var_name, make_dif3, .init = .) |>
#   select(id, species, contains("bill_length"), contains("bill_depth")) |>
#     glimpse()

13.1.3 longにしてから差分作成

r-wakalangで教えてもらった
サンプルサイズが大きすぎなければこちらの方がシンプル

p1_long <- 
  pivot_longer(p1, !c(species, id), 
               values_to = "old")

p2_long <- 
  pivot_longer(p2, !c(species, id), 
               values_to = "new")

p1_2_long <-
  p1_long |>
    left_join(p2_long, by = c("id", "species", "name"))
  
p1_2_long |>
   mutate(diff = old - new) |>
    pivot_wider(
      names_from = name,
      values_from = old:diff,
      names_glue = "{name}_{.value}"
    )  |> 
  select(id, species, contains("bill_length"), contains("bill_depth")) |>   
    glimpse()

## Rows: 3
## Columns: 8
## $ id                  <int> 1, 2, 3
## $ species             <fct> Adelie, Adelie, Adelie
## $ bill_length_mm_old  <dbl> 39.1, 39.5, 40.3
## $ bill_length_mm_new  <dbl> NA, 36.7, 39.3
## $ bill_length_mm_diff <dbl> NA, 2.8, 1.0
## $ bill_depth_mm_old   <dbl> 18.7, 17.4, 18.0
## $ bill_depth_mm_new   <dbl> NA, 19.3, 20.6
## $ bill_depth_mm_diff  <dbl> NA, -1.9, -2.6

13.2 再現可能なデータ例の作成

constructive::construct(penguins |> head(2))

## tibble::tibble(
##   species = factor(c("Adelie", "Adelie"), levels = c("Adelie", "Chinstrap", "Gentoo")),
##   island = factor(c("Torgersen", "Torgersen"), levels = c("Biscoe", "Dream", "Torgersen")),
##   bill_length_mm = c(39.1, 39.5),
##   bill_depth_mm = c(18.7, 17.4),
##   flipper_length_mm = c(181L, 186L),
##   body_mass_g = c(3750L, 3800L),
##   sex = factor(c("male", "female")),
##   year = c(2007L, 2007L),
## )

14 データのサイズ

14.1 Rで使用しているメモリ量

pryr::mem_used()

## 129 MB

14.2 オブジェクトのサイズ

test <- c(1:10000)
pryr::object_size(test)

## 40.05 kB

15 RCTの解析

パッケージHSAUR3をインストール

15.1 データ

btheb <- 
  HSAUR3::BtheB |> 
  clean_names() |>
  as_tibble()

# id付ける
btheb <- 
  btheb |> 
  rowid_to_column(var = "id") 

btheb

## # A tibble: 100 × 9
##       id drug  length treatment bdi_pre bdi_2m bdi_3m bdi_5m bdi_8m
##    <int> <fct> <fct>  <fct>       <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1     1 No    >6m    TAU            29      2      2     NA     NA
##  2     2 Yes   >6m    BtheB          32     16     24     17     20
##  3     3 Yes   <6m    TAU            25     20     NA     NA     NA
##  4     4 No    >6m    BtheB          21     17     16     10      9
##  5     5 Yes   >6m    BtheB          26     23     NA     NA     NA
##  6     6 Yes   <6m    BtheB           7      0      0      0      0
##  7     7 Yes   <6m    TAU            17      7      7      3      7
##  8     8 No    >6m    TAU            20     20     21     19     13
##  9     9 Yes   <6m    BtheB          18     13     14     20     11
## 10    10 Yes   >6m    BtheB          20      5      5      8     12
## # ℹ 90 more rows

15.2 wideからlongに

btheb_long <- 
  btheb |> 
  pivot_longer(!c(id:treatment),
               names_to = "time",
               names_prefix = "bdi_", # timeの値から"bdi_"を削除
               values_to = "bdi")

# timeをfactorにして順序指定
btheb_long <- 
  btheb_long |> 
  mutate(time = fct_relevel(time, "pre", "2m", "3m", "5m"))



btheb_long

## # A tibble: 500 × 6
##       id drug  length treatment time    bdi
##    <int> <fct> <fct>  <fct>     <fct> <dbl>
##  1     1 No    >6m    TAU       pre      29
##  2     1 No    >6m    TAU       2m        2
##  3     1 No    >6m    TAU       3m        2
##  4     1 No    >6m    TAU       5m       NA
##  5     1 No    >6m    TAU       8m       NA
##  6     2 Yes   >6m    BtheB     pre      32
##  7     2 Yes   >6m    BtheB     2m       16
##  8     2 Yes   >6m    BtheB     3m       24
##  9     2 Yes   >6m    BtheB     5m       17
## 10     2 Yes   >6m    BtheB     8m       20
## # ℹ 490 more rows

15.3 mixed model

15.3.1 mixed modelの解説

Brown, V. A. (2021). An Introduction to Linear Mixed-Effects Modeling in R. Advances in Methods and Practices in Psychological Science, 4(1), 2515245920960351.
GLMM FAQ

15.3.2 モデル作成と実行

mm <- 
  lme4::lmer(bdi ~ treatment*time + (1 | id), 
             data = btheb_long)

summary(mm)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bdi ~ treatment * time + (1 | id)
##    Data: btheb_long
## 
## REML criterion at convergence: 2632.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1787 -0.5372 -0.1079  0.4510  3.3967 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 83.59    9.143   
##  Residual             36.44    6.037   
## Number of obs: 380, groups:  id, 100
## 
## Fixed effects:
##                       Estimate Std. Error t value
## (Intercept)             24.188      1.581  15.295
## treatmentBtheB          -1.649      2.193  -0.752
## time2m                  -4.497      1.267  -3.551
## time3m                  -6.115      1.375  -4.448
## time5m                  -7.585      1.485  -5.108
## time8m                 -10.454      1.566  -6.676
## treatmentBtheB:time2m   -3.330      1.734  -1.920
## treatmentBtheB:time3m   -3.573      1.922  -1.859
## treatmentBtheB:time5m   -3.284      2.088  -1.573
## treatmentBtheB:time8m   -1.181      2.172  -0.543
## 
## Correlation of Fixed Effects:
##             (Intr) trtmBB time2m time3m time5m time8m trBB:2 trBB:3 trBB:5
## tretmntBthB -0.721                                                        
## time2m      -0.379  0.273                                                 
## time3m      -0.349  0.252  0.456                                          
## time5m      -0.323  0.233  0.422  0.430                                   
## time8m      -0.307  0.221  0.401  0.408  0.408                            
## trtmntBtB:2  0.277 -0.384 -0.731 -0.333 -0.309 -0.293                     
## trtmntBtB:3  0.250 -0.346 -0.326 -0.715 -0.308 -0.292  0.449              
## trtmntBtB:5  0.230 -0.319 -0.300 -0.306 -0.711 -0.290  0.413  0.423       
## trtmntBtB:8  0.221 -0.306 -0.289 -0.294 -0.294 -0.721  0.397  0.406  0.407

15.3.3 tidyな出力

broom.mixed::tidy(mm, conf.int = TRUE)

## # A tibble: 12 × 8
##    effect   group    term        estimate std.error statistic conf.low conf.high
##    <chr>    <chr>    <chr>          <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 fixed    <NA>     (Intercept)    24.2       1.58    15.3      21.1    27.3   
##  2 fixed    <NA>     treatmentB…    -1.65      2.19    -0.752    -5.95    2.65  
##  3 fixed    <NA>     time2m         -4.50      1.27    -3.55     -6.98   -2.01  
##  4 fixed    <NA>     time3m         -6.11      1.37    -4.45     -8.81   -3.42  
##  5 fixed    <NA>     time5m         -7.58      1.49    -5.11    -10.5    -4.67  
##  6 fixed    <NA>     time8m        -10.5       1.57    -6.68    -13.5    -7.38  
##  7 fixed    <NA>     treatmentB…    -3.33      1.73    -1.92     -6.73    0.0686
##  8 fixed    <NA>     treatmentB…    -3.57      1.92    -1.86     -7.34    0.193 
##  9 fixed    <NA>     treatmentB…    -3.28      2.09    -1.57     -7.38    0.808 
## 10 fixed    <NA>     treatmentB…    -1.18      2.17    -0.543    -5.44    3.08  
## 11 ran_pars id       sd__(Inter…     9.14     NA       NA        NA      NA     
## 12 ran_pars Residual sd__Observ…     6.04     NA       NA        NA      NA

15.3.4 icc

lme4::VarCorr(mm) |>
  as_tibble() |>
  mutate(icc = vcov/sum(vcov))

## # A tibble: 2 × 6
##   grp      var1        var2   vcov sdcor   icc
##   <chr>    <chr>       <chr> <dbl> <dbl> <dbl>
## 1 id       (Intercept) <NA>   83.6  9.14 0.696
## 2 Residual <NA>        <NA>   36.4  6.04 0.304

15.3.5 グラフ

btheb_long |> 
    mutate(id = factor(id)) |> 
    filter(id %in% c(1:20)) |> # IDが1~20までに限定
ggplot(aes(time, bdi, 
           group = id)) + # color = id
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(size = 1) +
  theme_minimal() +
  facet_wrap(vars(treatment))

15.3.6 対比 contrast

emmeans::emmeans(mm, specs = trt.vs.ctrl ~ time:treatment, adjust = "none")

## $emmeans
##  time treatment emmean   SE  df lower.CL upper.CL
##  pre  TAU         24.2 1.58 151    21.06     27.3
##  2m   TAU         19.7 1.61 158    16.51     22.9
##  3m   TAU         18.1 1.69 186    14.73     21.4
##  5m   TAU         16.6 1.79 216    13.08     20.1
##  8m   TAU         13.7 1.85 239    10.08     17.4
##  pre  BtheB       22.5 1.52 151    19.54     25.5
##  2m   BtheB       14.7 1.52 151    11.71     17.7
##  3m   BtheB       12.9 1.65 193     9.60     16.1
##  5m   BtheB       11.7 1.75 229     8.22     15.1
##  8m   BtheB       10.9 1.78 240     7.39     14.4
## 
## Degrees-of-freedom method: satterthwaite 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast            estimate   SE  df t.ratio p.value
##  2m TAU - pre TAU       -4.50 1.27 275  -3.551  0.0005
##  3m TAU - pre TAU       -6.11 1.37 279  -4.448  <.0001
##  5m TAU - pre TAU       -7.58 1.49 280  -5.108  <.0001
##  8m TAU - pre TAU      -10.45 1.57 280  -6.676  <.0001
##  pre BtheB - pre TAU    -1.65 2.19 151  -0.752  0.4532
##  2m BtheB - pre TAU     -9.48 2.19 151  -4.321  <.0001
##  3m BtheB - pre TAU    -11.34 2.28 171  -4.967  <.0001
##  5m BtheB - pre TAU    -12.52 2.36 190  -5.309  <.0001
##  8m BtheB - pre TAU    -13.28 2.38 196  -5.576  <.0001
## 
## Degrees-of-freedom method: satterthwaite

# 邪道だけど，絶対どうしてもp値を求められている場合
# lmerTest::lmer(bdi ~ treatment*time + (1 | id), 
#               data = btheb_long) |>
#   anova()

15.4 cohen’s d

EMAtools::lme.dscore()
- lmerTestの結果のtとdfから算出

\[ d = \frac{2t}{\sqrt{df}} \]

15.4.1 手動で確認

mmt <- 
  lmerTest::lmer(bdi ~ treatment*time + (1 | id), 
             data = btheb_long)

# 交互作用項だけに
summary(mmt)$coefficients |> 
  as_tibble(rownames = "terms") |> 
  filter(terms == "treatmentBtheB:time8m")

## # A tibble: 1 × 6
##   terms                 Estimate `Std. Error`    df `t value` `Pr(>|t|)`
##   <chr>                    <dbl>        <dbl> <dbl>     <dbl>      <dbl>
## 1 treatmentBtheB:time8m    -1.18         2.17  280.    -0.543      0.587

(2*-0.543)/sqrt(280) = -0.0649009

15.4.2 参考：通常の群間のCohen’s d

15.4.2.1 平均値

btheb |> 
  group_by(treatment) |> 
  summarise(mean = mean(bdi_8m, na.rm = TRUE),
            sd = sd(bdi_8m, na.rm = TRUE),
            n = sum(!is.na(bdi_8m)))

## # A tibble: 2 × 4
##   treatment  mean    sd     n
##   <fct>     <dbl> <dbl> <int>
## 1 TAU       13.6  11.5     25
## 2 BtheB      8.85  6.09    27

15.4.2.2 Cohen’s d

15.4.2.2.1 compute.es

compute.es::mes(13.6, 8.85, 11.5, 6.09, 25, 27)

## Mean Differences ES: 
##  
##  d [ 95 %CI] = 0.52 [ -0.03 , 1.08 ] 
##   var(d) = 0.08 
##   p-value(d) = 0.07 
##   U3(d) = 69.92 % 
##   CLES(d) = 64.4 % 
##   Cliff's Delta = 0.29 
##  
##  g [ 95 %CI] = 0.51 [ -0.03 , 1.06 ] 
##   var(g) = 0.08 
##   p-value(g) = 0.07 
##   U3(g) = 69.65 % 
##   CLES(g) = 64.19 % 
##  
##  Correlation ES: 
##  
##  r [ 95 %CI] = 0.26 [ -0.02 , 0.5 ] 
##   var(r) = 0.02 
##   p-value(r) = 0.07 
##  
##  z [ 95 %CI] = 0.26 [ -0.02 , 0.54 ] 
##   var(z) = 0.02 
##   p-value(z) = 0.07 
##  
##  Odds Ratio ES: 
##  
##  OR [ 95 %CI] = 2.58 [ 0.95 , 7.03 ] 
##   p-value(OR) = 0.07 
##  
##  Log OR [ 95 %CI] = 0.95 [ -0.06 , 1.95 ] 
##   var(lOR) = 0.26 
##   p-value(Log OR) = 0.07 
##  
##  Other: 
##  
##  NNT = 5.73 
##  Total N = 52

15.4.2.2.2 effectsize

effectsize::cohens_d(bdi_8m ~ treatment, data = btheb)

## Cohen's d |        95% CI
## -------------------------
## 0.52      | [-0.03, 1.07]
## 
## - Estimated using pooled SD.

15.5 Jacobson-Truax and reliable change indices

Evans, C., Margison, F., & Barkham, M. (1998). The contribution of reliable and clinically significant change methods to evidence-based mental health. Evidence-Based Mental Health, 1(3), 70-72.
パッケージ
- https://github.com/AWKruijt/JT-RCI
BDI-IIの情報
- 再検査信頼性:0.75
  - Erford, B. T., Johnson, E., & Bardoshi, G. (2016). Meta-analysis of the English version of the Beck depression inventory–second edition. Measurement and Evaluation in Counseling and Development, 49(1), 3-33.
- 平均値:13.7, SD:12.2
  - U.S. general population by Internet panel survey providers
  - Choi, S. W., Schalet, B., Cook, K. F., & Cella, D. (2014). Establishing a common metric for depressive symptoms: linking the BDI-II, CES-D, and PHQ-9 to PROMIS depression. Psychological assessment, 26(2), 513–527. https://doi.org/10.1037/a0035768

# data.frame型にする必要あり
btheb <- 
  btheb |> 
  as.data.frame()

# インストール
# devtools::install_github("AWKruijt/JT-RCI")

# jacobson & truaxのreliable and clinically significant change
JTRCI::JTRCI(data = btheb, 
      pre = "bdi_pre", post = "bdi_8m",
      group = "treatment",
      reliability = .75,
      JTcrit = "C",
      normM = 13.7, normSD = 12.2 # 健康な集団の平均値とSD,
      # dysfM = , dysfSD = 
        )

##    Jacobson-Truax classification    TAU  BtheB
##                           <char> <char> <char>
## 1:                  deteriorated      0      0
## 2:                     unchanged      7      2
## 3:                      improved      1      0
## 4:        non reliably recovered      8     15
## 5:                     recovered      9     10

15.5.1 個別の結果を表示

preとpostで欠損がリストワイズされてることに注意
RCI > 1.96　がよい
RCI = change_abs/Sdiff

JTRCIdf |> head()

##   ppid group pre post change_abs SEmeasurement    Sdiff    crittype critval
## 1    1 BtheB  32   20        -12      4.804502 6.794592 criterion C 18.9132
## 2    2 BtheB  21    9        -12      4.804502 6.794592 criterion C 18.9132
## 3    3 BtheB   7    0         -7      4.804502 6.794592 criterion C 18.9132
## 4    4   TAU  17    7        -10      4.804502 6.794592 criterion C 18.9132
## 5    5   TAU  20   13         -7      4.804502 6.794592 criterion C 18.9132
## 6    6 BtheB  18   11         -7      4.804502 6.794592 criterion C 18.9132
##         RCI            class_JTRCI
## 1 -1.766111              unchanged
## 2 -1.766111 non reliably recovered
## 3 -1.030231 non reliably recovered
## 4 -1.471759 non reliably recovered
## 5 -1.030231 non reliably recovered
## 6 -1.030231 non reliably recovered

15.5.1.1 別パッケージでRCI算出

suddengainsパッケージ

# メタ分析による再検査信頼性
test_retest_bdi <- 0.75

# 8mの欠損を除外したtime1でのBDIのSD
bdi_pre_sd <- 
  btheb |> 
  drop_na(bdi_8m) |> 
  summarise(sd(bdi_pre, na.rm = TRUE)) |> 
  pull()

# RCI算出
suddengains::define_crit1_cutoff(sd = bdi_pre_sd,
                    reliability = test_retest_bdi)

## $sd
## [1] 9.609004
## 
## $reliability
## [1] 0.75
## 
## $standard_error_measurement
## [1] 4.804502
## 
## $standard_error_difference
## [1] 6.794592
## 
## $reliable_change_value
## [1] 13.3174

15.5.1.2 RCI手計算

# pre SD
bdi_pre_sd

## [1] 9.609004

# standard error
se_index <- 
bdi_pre_sd*sqrt(1-test_retest_bdi)
se_index

## [1] 4.804502

# standar error difference
se_d <- 
  sqrt(2*se_index^2)
se_d

## [1] 6.794592

# pre - postが次の値を超えるとreliableな値

1.96*se_d

## [1] 13.3174

16 csv読み込み

16.1 基本

library(vroom)
vroom("data/penguins.csv")

## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>              <dbl>         <dbl>             <dbl>       <dbl>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <chr>, year <dbl>

16.2 読み込む変数選択

16.2.1 列を特定

vroom("data/penguins.csv",
      col_select = c(species, sex))

## # A tibble: 344 × 2
##    species sex   
##    <chr>   <chr> 
##  1 Adelie  male  
##  2 Adelie  female
##  3 Adelie  female
##  4 Adelie  <NA>  
##  5 Adelie  female
##  6 Adelie  male  
##  7 Adelie  female
##  8 Adelie  male  
##  9 Adelie  <NA>  
## 10 Adelie  <NA>  
## # ℹ 334 more rows

16.2.2 ヘルパー関数

vroom("data/penguins.csv",
      col_select = starts_with("bill"))

## # A tibble: 344 × 2
##    bill_length_mm bill_depth_mm
##             <dbl>         <dbl>
##  1           39.1          18.7
##  2           39.5          17.4
##  3           40.3          18  
##  4           NA            NA  
##  5           36.7          19.3
##  6           39.3          20.6
##  7           38.9          17.8
##  8           39.2          19.6
##  9           34.1          18.1
## 10           42            20.2
## # ℹ 334 more rows

16.2.3 読まない列

vroom("data/penguins.csv",
      col_select = !c(species, island))

## # A tibble: 344 × 6
##    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex     year
##             <dbl>         <dbl>             <dbl>       <dbl> <chr>  <dbl>
##  1           39.1          18.7               181        3750 male    2007
##  2           39.5          17.4               186        3800 female  2007
##  3           40.3          18                 195        3250 female  2007
##  4           NA            NA                  NA          NA <NA>    2007
##  5           36.7          19.3               193        3450 female  2007
##  6           39.3          20.6               190        3650 male    2007
##  7           38.9          17.8               181        3625 female  2007
##  8           39.2          19.6               195        4675 male    2007
##  9           34.1          18.1               193        3475 <NA>    2007
## 10           42            20.2               190        4250 <NA>    2007
## # ℹ 334 more rows

vroom("data/penguins.csv",
      col_select = list(!c(starts_with("bill"), starts_with("s"))))

## # A tibble: 344 × 4
##    island    flipper_length_mm body_mass_g  year
##    <chr>                 <dbl>       <dbl> <dbl>
##  1 Torgersen               181        3750  2007
##  2 Torgersen               186        3800  2007
##  3 Torgersen               195        3250  2007
##  4 Torgersen                NA          NA  2007
##  5 Torgersen               193        3450  2007
##  6 Torgersen               190        3650  2007
##  7 Torgersen               181        3625  2007
##  8 Torgersen               195        4675  2007
##  9 Torgersen               193        3475  2007
## 10 Torgersen               190        4250  2007
## # ℹ 334 more rows

16.3 読み込む型を指定

16.3.1 変数ごとに指定

因子型 “f”
skip “_”
文字型 “c”

vroom("data/penguins.csv",
      col_types = c(species = "f", island = "_", bill_length_mm = "c"))

## # A tibble: 344 × 7
##    species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex   
##    <fct>   <chr>                  <dbl>             <dbl>       <dbl> <chr> 
##  1 Adelie  39.1                    18.7               181        3750 male  
##  2 Adelie  39.5                    17.4               186        3800 female
##  3 Adelie  40.3                    18                 195        3250 female
##  4 Adelie  <NA>                    NA                  NA          NA <NA>  
##  5 Adelie  36.7                    19.3               193        3450 female
##  6 Adelie  39.3                    20.6               190        3650 male  
##  7 Adelie  38.9                    17.8               181        3625 female
##  8 Adelie  39.2                    19.6               195        4675 male  
##  9 Adelie  34.1                    18.1               193        3475 <NA>  
## 10 Adelie  42                      20.2               190        4250 <NA>  
## # ℹ 334 more rows
## # ℹ 1 more variable: year <dbl>

16.3.2 すべて文字型で

vroom("data/penguins.csv",
      col_types = c(.default = "c"))

## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>     <chr>          <chr>         <chr>             <chr>      
##  1 Adelie  Torgersen 39.1           18.7          181               3750       
##  2 Adelie  Torgersen 39.5           17.4          186               3800       
##  3 Adelie  Torgersen 40.3           18            195               3250       
##  4 Adelie  Torgersen <NA>           <NA>          <NA>              <NA>       
##  5 Adelie  Torgersen 36.7           19.3          193               3450       
##  6 Adelie  Torgersen 39.3           20.6          190               3650       
##  7 Adelie  Torgersen 38.9           17.8          181               3625       
##  8 Adelie  Torgersen 39.2           19.6          195               4675       
##  9 Adelie  Torgersen 34.1           18.1          193               3475       
## 10 Adelie  Torgersen 42             20.2          190               4250       
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <chr>, year <chr>

17 dplyrあれこれ

17.1 select

17.1.1 変数の並びをアルファベット順にソートする

magrittr %>% じゃないと動かない

df <- 
  tribble(~item10, ~item1, ~item100,  ~item5, ~id,
                10,     1,      100,       5,   1) # データの値に特に意味はない

df %>% 
  select(sort(names(.)))

## # A tibble: 1 × 5
##      id item1 item10 item100 item5
##   <dbl> <dbl>  <dbl>   <dbl> <dbl>
## 1     1     1     10     100     5

ただし，数字の並びは昇順にならない

17.1.2 変数の並びを数字順にする

df |> 
  select(id, num_range("item", 1:100))

## # A tibble: 1 × 5
##      id item1 item5 item10 item100
##   <dbl> <dbl> <dbl>  <dbl>   <dbl>
## 1     1     1     5     10     100

17.2 rowごとの処理

https://dplyr.tidyverse.org/articles/rowwise.html

17.2.1 そのまま

df <- 
  tibble(a = c(1,  1,  1),
         b = c(NA, 2,  2),
         c = c(NA, NA, 3))

df |> 
  mutate(total = a + b + c)

## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA    NA
## 2     1     2    NA    NA
## 3     1     2     3     6

df |> 
  mutate(total = sum(c(a, b, c), na.rm = TRUE))

## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA    10
## 2     1     2    NA    10
## 3     1     2     3    10

df |> 
  mutate(mean_abc = mean(c(a,b,c), na.rm = TRUE))

## # A tibble: 3 × 4
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA     1.67
## 2     1     2    NA     1.67
## 3     1     2     3     1.67

# 非NAの6セルで合計10を割る
10/6

## [1] 1.666667

17.2.2 rowwise

17.2.2.1 sum

df |> 
  rowwise() |> 
  mutate(total = sum(c(a, b, c), na.rm = TRUE))

## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA     1
## 2     1     2    NA     3
## 3     1     2     3     6

17.2.2.2 mean

df |> 
  rowwise() |> 
  mutate(mean_abc = mean(c(a, b, c), na.rm = TRUE))

## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA      1  
## 2     1     2    NA      1.5
## 3     1     2     3      2

17.2.2.3 selection helperを使う

df |> 
  rowwise() |> 
  mutate(mean_abc = mean(c_across(a:c), na.rm = TRUE))

## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA      1  
## 2     1     2    NA      1.5
## 3     1     2     3      2

df |> 
  rowwise() |> 
  mutate(mean_abc = mean(c_across(everything()), na.rm = TRUE))

## # A tibble: 3 × 4
## # Rowwise: 
##       a     b     c mean_abc
##   <dbl> <dbl> <dbl>    <dbl>
## 1     1    NA    NA      1  
## 2     1     2    NA      1.5
## 3     1     2     3      2

17.2.3 rowSumsとrowMean

こっちの方が早い

df |> 
  mutate(total = rowSums(pick(a:c), na.rm = TRUE))

## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA     1
## 2     1     2    NA     3
## 3     1     2     3     6

df |> 
  mutate(total = rowMeans(pick(a:c), na.rm = TRUE))

## # A tibble: 3 × 4
##       a     b     c total
##   <dbl> <dbl> <dbl> <dbl>
## 1     1    NA    NA   1  
## 2     1     2    NA   1.5
## 3     1     2     3   2

# dplyr < 1.1.0
# df |> 
#   mutate(total = rowSums(across(a:c), na.rm = TRUE))
# 
# df |> 
#   mutate(total = rowMeans(across(a:c), na.rm = TRUE))

17.3 rename

17.3.1 変更対応のリストを代入したい時は名前付きベクトルで

old <- c("species", "island", "bill_length_mm")
new <- c("種", "島", "くちばし長さ")
newvars <- setNames(old, new)

penguins |>
  rename(!!!newvars)

## # A tibble: 344 × 8
##    種     島      くちばし長さ bill_depth_mm flipper_length_mm body_mass_g sex  
##    <fct>  <fct>          <dbl>         <dbl>             <int>       <int> <fct>
##  1 Adelie Torger…         39.1          18.7               181        3750 male 
##  2 Adelie Torger…         39.5          17.4               186        3800 fema…
##  3 Adelie Torger…         40.3          18                 195        3250 fema…
##  4 Adelie Torger…         NA            NA                  NA          NA <NA> 
##  5 Adelie Torger…         36.7          19.3               193        3450 fema…
##  6 Adelie Torger…         39.3          20.6               190        3650 male 
##  7 Adelie Torger…         38.9          17.8               181        3625 fema…
##  8 Adelie Torger…         39.2          19.6               195        4675 male 
##  9 Adelie Torger…         34.1          18.1               193        3475 <NA> 
## 10 Adelie Torger…         42            20.2               190        4250 <NA> 
## # ℹ 334 more rows
## # ℹ 1 more variable: year <int>

17.3.1.1 変更対応リストがデータフレームであるとき

rename_df <- tibble(
old = c("species", "island", "bill_length_mm"),
new = c("種", "島", "くちばし長さ")
)

rename_df

## # A tibble: 3 × 2
##   old            new         
##   <chr>          <chr>       
## 1 species        種          
## 2 island         島          
## 3 bill_length_mm くちばし長さ

rename_df |> 
  relocate(new) |> # newを最初に並べ替え
deframe()

##               種               島     くちばし長さ 
##        "species"         "island" "bill_length_mm"

17.4 mutate

17.4.1 case_whenで値ではなく関数や式を指定する

tribble(~a, ~b,
        0, 1,
        1, 2,
        2, 3) |> 
  mutate(b = case_when(a == 0 ~ (\(x) x+1)(b),
                       a == 1 ~ (\(x) x+2)(b),
                       .default = b)
  )

## # A tibble: 3 × 2
##       a     b
##   <dbl> <dbl>
## 1     0     2
## 2     1     4
## 3     2     3

17.4.2 対照表を元に値を変換していく

17.4.2.1 case_whenの場合

対照表なし

penguins |> 
  mutate(speciesj = case_when(
                       species == "Adelie" ~ "アデリー",
                       species == "Chinstrap" ~ "ヒゲ",
                       species == "Gentoo" ~ "ジェンツー",
                       .default = "その他"),
                       .keep = "used"
  ) |> 
  count(species, speciesj)

## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124

  # group_by(species) |> 
  # slice_head(n = 2)

17.4.2.2 case_matchの場合

対照表なし

penguins |> 
  mutate(speciesj = case_match(species,
                       "Adelie" ~ "アデリー",
                       "Chinstrap" ~ "ヒゲ",
                       "Gentoo" ~ "ジェンツー",
                       .default = "その他"
                      ),
                       .keep = "used"
  ) |> 
  count(species, speciesj)

## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124

  # group_by(species) |> 
  # slice_head(n = 2)

17.4.2.3 left_joinでも置き換えと同様の結果を得られる

対照表あり。大量の置き換えならこれが最もお手軽な気がする
- reference table and left_join

old <- 
  penguins |> 
  count(species) |> 
  pull(species)

new <- c("アデリー", "ヒゲ",   "ジェンツー")

taisho <- 
  tibble(species = old,
         speciesj = new)

penguins |> 
  left_join(taisho) |> 
  count(species, speciesj)

## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124

17.4.2.4 名前付きベクトルで置き換え

対照表は2つのベクトル
https://gist.github.com/IndrajeetPatil/baff8cf625d9fcd5ef2a36f520373426
https://www.r-bloggers.com/2018/01/recode-values-with-character-subsetting/

new_old <- 
  new |>  
  set_names(old)

# オブジェクト作らずnewを上書き
# names(new) <- old

penguins |> 
  mutate(speciesj = new_old[species]) |> 
  count(species, speciesj)

## # A tibble: 3 × 3
##   species   speciesj       n
##   <fct>     <chr>      <int>
## 1 Adelie    アデリー     152
## 2 Chinstrap ヒゲ          68
## 3 Gentoo    ジェンツー   124

17.4.3 複数回答が1つの列にテキストで入っている時，回答別のフラグ列を作成

17.4.3.1 サンプルデータ作成

# 区切りにスペースなし
df <- tribble(~id, ~multi_answer,
              1, "りんご,みかん,ぶどう",
              2, "りんご,ぶどう",
              3, "メロン,ぶどう",
              4, "ぶどう,りんご,みかん,メロン")

df

## # A tibble: 4 × 2
##      id multi_answer               
##   <dbl> <chr>                      
## 1     1 りんご,みかん,ぶどう       
## 2     2 りんご,ぶどう              
## 3     3 メロン,ぶどう              
## 4     4 ぶどう,りんご,みかん,メロン

# スペース入っている場合
# tribble(~id, ~multi_answer,
#         1, "りんご, みかん, ぶどう",
#         2, "りんご, ぶどう",
#         3, "メロン, ぶどう",
#         4, "ぶどう, りんご, みかん, メロン") |> 
#   mutate(multi_answer = str_remove_all(multi_answer, " ")) # スペース除去

17.4.3.2 文字列をリストに分割して新しい列に格納

df <- 
  df |> 
  mutate(multi_answer_l = str_split(multi_answer, ",")) # 区切りは","

df

## # A tibble: 4 × 3
##      id multi_answer                multi_answer_l
##   <dbl> <chr>                       <list>        
## 1     1 りんご,みかん,ぶどう        <chr [3]>     
## 2     2 りんご,ぶどう               <chr [2]>     
## 3     3 メロン,ぶどう               <chr [2]>     
## 4     4 ぶどう,りんご,みかん,メロン <chr [4]>

17.4.3.3 各列に分割

df <- 
  df |> tidyr::hoist(multi_answer_l, 
             # 列名指定   
                     f1 = 1, 
                     f2 = 2, 
                     f3 = 3, 
                     f4 = 4) 

df

## # A tibble: 4 × 6
##      id multi_answer                f1     f2     f3     f4    
##   <dbl> <chr>                       <chr>  <chr>  <chr>  <chr> 
## 1     1 りんご,みかん,ぶどう        りんご みかん ぶどう <NA>  
## 2     2 りんご,ぶどう               りんご ぶどう <NA>   <NA>  
## 3     3 メロン,ぶどう               メロン ぶどう <NA>   <NA>  
## 4     4 ぶどう,りんご,みかん,メロン ぶどう りんご みかん メロン

17.4.3.4 選択肢ごとにフラグを表す列を追加

 df |> 
  mutate(りんご = case_when(f1 == "りんご" |
                            f2 == "りんご" |
                            f3 == "りんご" |
                            f4 == "りんご" ~ 1,
                                      TRUE ~ 0),
         みかん = case_when(f1 == "みかん" |
                            f2 == "みかん" |
                            f3 == "みかん" |
                            f4 == "みかん" ~ 1,
                                      TRUE ~ 0),
         ぶどう = case_when(f1 == "ぶどう" |
                            f2 == "ぶどう" |
                            f3 == "ぶどう" |
                            f4 == "ぶどう" ~ 1,
                                      TRUE ~ 0),
         メロン = case_when(f1 == "メロン" |
                            f2 == "メロン" |
                            f3 == "メロン" |
                            f4 == "メロン" ~ 1,
                                      TRUE ~ 0)
  )

## # A tibble: 4 × 10
##      id multi_answer         f1    f2    f3    f4    りんご みかん ぶどう メロン
##   <dbl> <chr>                <chr> <chr> <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     1 りんご,みかん,ぶどう りん… みか… ぶど… <NA>       1      1      1      0
## 2     2 りんご,ぶどう        りん… ぶど… <NA>  <NA>       1      0      1      0
## 3     3 メロン,ぶどう        メロ… ぶど… <NA>  <NA>       0      0      1      1
## 4     4 ぶどう,りんご,みか…  ぶど… りん… みか… メロ…      1      1      1      1

17.4.3.4.1 【効率化】

17.4.3.4.1.1 関数作成

# 回答選択肢ごとに列を作る関数作成
multi_answer <- function(data, answer){
  data |> 
    mutate(!!sym({{answer}}) := case_when(f1 == answer |
                                  　　　  f2 == answer |
                                  　　　  f3 == answer |
                                  　　　  f4 == answer ~ 1,
                                                　TRUE ~ 0))
}


multi_answer(df, "りんご")

## # A tibble: 4 × 7
##      id multi_answer                f1     f2     f3     f4     りんご
##   <dbl> <chr>                       <chr>  <chr>  <chr>  <chr>   <dbl>
## 1     1 りんご,みかん,ぶどう        りんご みかん ぶどう <NA>        1
## 2     2 りんご,ぶどう               りんご ぶどう <NA>   <NA>        1
## 3     3 メロン,ぶどう               メロン ぶどう <NA>   <NA>        0
## 4     4 ぶどう,りんご,みかん,メロン ぶどう りんご みかん メロン      1

17.4.3.4.1.2 関数適用

multi_answer(df, "りんご") |> 
multi_answer("みかん") |> 
multi_answer("ぶどう") |> 
multi_answer("メロン")

## # A tibble: 4 × 10
##      id multi_answer         f1    f2    f3    f4    りんご みかん ぶどう メロン
##   <dbl> <chr>                <chr> <chr> <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     1 りんご,みかん,ぶどう りん… みか… ぶど… <NA>       1      1      1      0
## 2     2 りんご,ぶどう        りん… ぶど… <NA>  <NA>       1      0      1      0
## 3     3 メロン,ぶどう        メロ… ぶど… <NA>  <NA>       0      0      1      1
## 4     4 ぶどう,りんご,みか…  ぶど… りん… みか… メロ…      1      1      1      1

17.4.3.4.2 【さらに効率化】

contents <- 
  c("りんご", "みかん", "ぶどう", "メロン")

df_multi <- 
  map(contents, \(x) multi_answer(df, x)) |> 
  reduce(full_join)

df_multi

## # A tibble: 4 × 10
##      id multi_answer         f1    f2    f3    f4    りんご みかん ぶどう メロン
##   <dbl> <chr>                <chr> <chr> <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     1 りんご,みかん,ぶどう りん… みか… ぶど… <NA>       1      1      1      0
## 2     2 りんご,ぶどう        りん… ぶど… <NA>  <NA>       1      0      1      0
## 3     3 メロン,ぶどう        メロ… ぶど… <NA>  <NA>       0      0      1      1
## 4     4 ぶどう,りんご,みか…  ぶど… りん… みか… メロ…      1      1      1      1

17.5 across周り

17.5.1 作成する変数の名前をコントロール

across()の中で.names = 引数を入れる

penguins |> 
  summarise(across(c(bill_length_mm, bill_depth_mm),
                   list(
                     mean = \(x) mean(x, na.rm = TRUE),
                     n    = \(x) sum(!is.na(x))),
                   .names = "{.fn}_{.col}"
                   )
            
            )

## # A tibble: 1 × 4
##   mean_bill_length_mm n_bill_length_mm mean_bill_depth_mm n_bill_depth_mm
##                 <dbl>            <int>              <dbl>           <int>
## 1                43.9              342               17.2             342

18 tidyrあれこれ

18.1 pivot_wider

18.1.1 基本

mtcars |> 
  count(vs)

##   vs  n
## 1  0 18
## 2  1 14

mtcars |> 
  count(vs) |> 
  pivot_wider(names_from = vs,
              values_from = n)

## # A tibble: 1 × 2
##     `0`   `1`
##   <int> <int>
## 1    18    14

names_fromに複数列指定

mtcars |> 
  count(vs) |> 
  mutate(name = "vs")

##   vs  n name
## 1  0 18   vs
## 2  1 14   vs

mtcars |> 
  count(vs) |>
  mutate(name = "vs") |> 
  pivot_wider(names_from = c(name,vs),
              values_from = n)

## # A tibble: 1 × 2
##    vs_0  vs_1
##   <int> <int>
## 1    18    14

names_glue(namesの変数1つ)

mtcars |> 
  count(vs) |>
  pivot_wider(names_from = vs,
              names_glue ="{.value}_{.name}",
              values_from = n)

## # A tibble: 1 × 2
##     n_0   n_1
##   <int> <int>
## 1    18    14

names_glue(namesの列名指定)

mtcars |> 
  count(vs) |>
  pivot_wider(names_from = vs,
              names_glue ="{.value}_{vs}",
              values_from = n)

## # A tibble: 1 × 2
##     n_0   n_1
##   <int> <int>
## 1    18    14

names_glue(namesの変数2つ)

mtcars |> 
     count(vs) |>
     mutate(name = "vs") |> 
     pivot_wider(names_from = c(name,vs),
                 names_glue ="{.value}_{.name}",
                 values_from = n)

## # A tibble: 1 × 2
##   n_vs_0 n_vs_1
##    <int>  <int>
## 1     18     14

18.1.2 (🚧作成中)関数にして複数変数への適用

19 回帰

19.1 線形

res <- 
  lm(mpg ~ ., data = mtcars)

broom::tidy(res)

## # A tibble: 11 × 5
##    term        estimate std.error statistic p.value
##    <chr>          <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)  12.3      18.7        0.657  0.518 
##  2 cyl          -0.111     1.05      -0.107  0.916 
##  3 disp          0.0133    0.0179     0.747  0.463 
##  4 hp           -0.0215    0.0218    -0.987  0.335 
##  5 drat          0.787     1.64       0.481  0.635 
##  6 wt           -3.72      1.89      -1.96   0.0633
##  7 qsec          0.821     0.731      1.12   0.274 
##  8 vs            0.318     2.10       0.151  0.881 
##  9 am            2.52      2.06       1.23   0.234 
## 10 gear          0.655     1.49       0.439  0.665 
## 11 carb         -0.199     0.829     -0.241  0.812

19.1.1 標準化係数

res2 <- 
  lm.beta::lm.beta(res)

broom::tidy(res2)

## # A tibble: 11 × 6
##    term        estimate std_estimate std.error statistic p.value
##    <chr>          <dbl>        <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)  12.3         NA        18.7        0.657  0.518 
##  2 cyl          -0.111       -0.0330    1.05      -0.107  0.916 
##  3 disp          0.0133       0.274     0.0179     0.747  0.463 
##  4 hp           -0.0215      -0.244     0.0218    -0.987  0.335 
##  5 drat          0.787        0.0698    1.64       0.481  0.635 
##  6 wt           -3.72        -0.603     1.89      -1.96   0.0633
##  7 qsec          0.821        0.243     0.731      1.12   0.274 
##  8 vs            0.318        0.0266    2.10       0.151  0.881 
##  9 am            2.52         0.209     2.06       1.23   0.234 
## 10 gear          0.655        0.0802    1.49       0.439  0.665 
## 11 carb         -0.199       -0.0534    0.829     -0.241  0.812

20 相関

20.1 基本

https://corrr.tidymodels.org/

library(corrr)

mtcars |> 
  correlate()

## # A tibble: 11 × 12
##    term     mpg    cyl   disp     hp    drat     wt    qsec     vs      am
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
##  1 mpg   NA     -0.852 -0.848 -0.776  0.681  -0.868  0.419   0.664  0.600 
##  2 cyl   -0.852 NA      0.902  0.832 -0.700   0.782 -0.591  -0.811 -0.523 
##  3 disp  -0.848  0.902 NA      0.791 -0.710   0.888 -0.434  -0.710 -0.591 
##  4 hp    -0.776  0.832  0.791 NA     -0.449   0.659 -0.708  -0.723 -0.243 
##  5 drat   0.681 -0.700 -0.710 -0.449 NA      -0.712  0.0912  0.440  0.713 
##  6 wt    -0.868  0.782  0.888  0.659 -0.712  NA     -0.175  -0.555 -0.692 
##  7 qsec   0.419 -0.591 -0.434 -0.708  0.0912 -0.175 NA       0.745 -0.230 
##  8 vs     0.664 -0.811 -0.710 -0.723  0.440  -0.555  0.745  NA      0.168 
##  9 am     0.600 -0.523 -0.591 -0.243  0.713  -0.692 -0.230   0.168 NA     
## 10 gear   0.480 -0.493 -0.556 -0.126  0.700  -0.583 -0.213   0.206  0.794 
## 11 carb  -0.551  0.527  0.395  0.750 -0.0908  0.428 -0.656  -0.570  0.0575
## # ℹ 2 more variables: gear <dbl>, carb <dbl>

20.1.1 変数絞って表示

mtcars |> 
  correlate() |> 
  focus(mpg, disp, wt, mirror = TRUE)

## # A tibble: 3 × 4
##   term     mpg   disp     wt
##   <chr>  <dbl>  <dbl>  <dbl>
## 1 mpg   NA     -0.848 -0.868
## 2 disp  -0.848 NA      0.888
## 3 wt    -0.868  0.888 NA

20.1.2 下側のみに

mtcars |> 
  correlate() |> 
  focus(mpg, disp, wt, mirror = TRUE) |> 
  shave()

## # A tibble: 3 × 4
##   term     mpg   disp    wt
##   <chr>  <dbl>  <dbl> <dbl>
## 1 mpg   NA     NA        NA
## 2 disp  -0.848 NA        NA
## 3 wt    -0.868  0.888    NA

20.1.3 ペアワイズ時のnを算出

psychTools::bfi |>
  select(A1,A2,C1,C2) |> 
  pair_n()

##      A1   A2   C1   C2
## A1 2784 2757 2764 2761
## A2 2757 2773 2752 2752
## C1 2764 2752 2779 2755
## C2 2761 2752 2755 2776
## attr(,"class")
## [1] "n_mat"  "matrix"

# データフレームに

psychTools::bfi |>
  select(A1,A2,C1,C2) |> 
  pair_n() |> 
  as_tibble()

## # A tibble: 4 × 4
##   A1      A2      C1      C2     
##   <n_mat> <n_mat> <n_mat> <n_mat>
## 1 2784    2757    2764    2761   
## 2 2757    2773    2752    2752   
## 3 2764    2752    2779    2755   
## 4 2761    2752    2755    2776

20.2 層別

https://stackoverflow.com/questions/56243874/tidyverse-correlations-among-multiple-columns-grouped-by-other-column

mtcars |>
  select(mpg, disp, wt, cyl) |>
  nest(data = !cyl) |>
  mutate(
    correlations = map(data, correlate)
  ) |>
  unnest(correlations)

## # A tibble: 9 × 6
##     cyl data              term     mpg   disp     wt
##   <dbl> <list>            <chr>  <dbl>  <dbl>  <dbl>
## 1     6 <tibble [7 × 3]>  mpg   NA      0.103 -0.682
## 2     6 <tibble [7 × 3]>  disp   0.103 NA      0.473
## 3     6 <tibble [7 × 3]>  wt    -0.682  0.473 NA    
## 4     4 <tibble [11 × 3]> mpg   NA     -0.805 -0.713
## 5     4 <tibble [11 × 3]> disp  -0.805 NA      0.857
## 6     4 <tibble [11 × 3]> wt    -0.713  0.857 NA    
## 7     8 <tibble [14 × 3]> mpg   NA     -0.520 -0.650
## 8     8 <tibble [14 × 3]> disp  -0.520 NA      0.755
## 9     8 <tibble [14 × 3]> wt    -0.650  0.755 NA

20.2.1 1つのデータフレームに

mtcars |> 
  select(mpg, disp, wt, cyl) |> 
  group_by(cyl) |>
  group_modify(~ correlate(.x))

## # A tibble: 9 × 5
## # Groups:   cyl [3]
##     cyl term     mpg   disp     wt
##   <dbl> <chr>  <dbl>  <dbl>  <dbl>
## 1     4 mpg   NA     -0.805 -0.713
## 2     4 disp  -0.805 NA      0.857
## 3     4 wt    -0.713  0.857 NA    
## 4     6 mpg   NA      0.103 -0.682
## 5     6 disp   0.103 NA      0.473
## 6     6 wt    -0.682  0.473 NA    
## 7     8 mpg   NA     -0.520 -0.650
## 8     8 disp  -0.520 NA      0.755
## 9     8 wt    -0.650  0.755 NA

20.2.2 listのままにして加工してから1つのデータフレームに

# groupにする変数の水準を抜き出す
cyl_level <- 
mtcars |> 
  count(cyl) |> 
  pull(cyl)

mtcars |> 
  select(mpg, disp, wt, cyl) |> 
  group_by(cyl) |> 
  group_map(~ correlate(.)) |>  # ここは旧表記じゃないと動かない
  map(\(x) shave(x)) |>         # 相関行列の上側を
  set_names(cyl_level) |>       # group変数の水準を適用
  bind_rows(.id = "cyl_level")

## # A tibble: 9 × 5
##   cyl_level term     mpg   disp    wt
##   <chr>     <chr>  <dbl>  <dbl> <dbl>
## 1 4         mpg   NA     NA        NA
## 2 4         disp  -0.805 NA        NA
## 3 4         wt    -0.713  0.857    NA
## 4 6         mpg   NA     NA        NA
## 5 6         disp   0.103 NA        NA
## 6 6         wt    -0.682  0.473    NA
## 7 8         mpg   NA     NA        NA
## 8 8         disp  -0.520 NA        NA
## 9 8         wt    -0.650  0.755    NA

21 時間測定

21.1 tictoc

tictoc::tic()
# tic()とtoc()の間に処理を挟む
Sys.sleep(1) # 1秒待つ
tictoc::toc()

## 1.03 sec elapsed

21.1.1 タイトルをつけてログを残す

tictoc::tic("1秒待つ")
Sys.sleep(1)
tictoc::toc(log = TRUE)

## 1秒待つ: 1.02 sec elapsed

tictoc::tic("2秒待つ")
Sys.sleep(2)
tictoc::toc(log = TRUE)

## 2秒待つ: 2.03 sec elapsed

21.1.1.1 ログ出力

tictoc::tic.log() |> 
  unlist() |> 
  tibble()

## # A tibble: 2 × 1
##   `unlist(tictoc::tic.log())`
##   <chr>                      
## 1 1秒待つ: 1.02 sec elapsed  
## 2 2秒待つ: 2.03 sec elapsed

22 コードの書式修正

remedyパッケージを読み込んだ後に修正したい範囲を指定し以下を実行
- <- Addins > Align
- = Addins > AlignEqual
https://thinkr-open.github.io/remedy/index.html

22.1 関連ショートカット

ctrl + I 再インデント（階層構造をきれいにする）
ctrl + shift + A 再フォーマット（複数行にする）

22.1.1 例　

mtcars |>  
select(vs,am,gear)

23 引用関連

23.1 RとR Studioまたはパッケージの引用

Rを引用する(RjpWiki) 　- citation()
Citing R(An Introduction to R)
- citation("パッケージ名")
Citing RStudio
- RStudio.Version()

23.2 R Studioでの文献引用

RStudio 1.4 Preview: Citations

24 ✍Webスクレイピング

25 R Markdown

25.1 数式

Mathematics in R Markdown

25.2 チャンクの色

https://bookdown.org/yihui/rmarkdown-cookbook/chunk-styling.html

# {r class.source="bg-info"}
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# {r class.source="bg-primary"}

# {r class.source="bg-success"}

# {r class.source="bg-warning"}

# {r class.source="bg-danger"}

25.2.1 出力チャンクと色分け

# {r class.source="bg-danger", class.output="bg-warning"}

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

26 Excel関連

26.1 openxlsx

https://ycphs.github.io/openxlsx/index.html

# 空のワークブックとシートを作成
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb, "ペンギン")
addWorksheet(wb, "ペンギンraw")

# フォントをデフォルトからMS Pゴシックに変更
modifyBaseFont(wb, "MS PGothic")

# データをwbに入れる
writeData(wb, sheet = "ペンギン", penguins, startCol = 1, startRow = 1)
writeData(wb, sheet = "ペンギンraw", penguins_raw, startCol = 1, startRow = 1)

# 先頭行および先頭列固定
freezePane(wb, "ペンギン", firstRow = TRUE, firstCol = TRUE) 
freezePane(wb, "ペンギンraw", firstRow = TRUE, firstCol = TRUE)


# 保存
saveWorkbook(wb, "out/penguins.xlsx", overwrite = TRUE)

27 R以外の関連メモ

27.1 画面の動画キャプチャをgifファイルに

動画だけならwindowsキー + G でもできる
ScreenToGifを使う

practice_misc

やわらかクジラ

2024/07/21

1 パッケージ読み込み

1.1 パッケージ入ってない場合はインストールして読み込む場合（コードのみ表示）

2 github連携

2.1 githubのレポジトリをローカルに作成して管理

3 ベクトル操作

3.1 作成

3.2 要素を削除

3.3 要素のソート

4 正規表現

4.1 数字,文字

4.1.1 例の作成

4.1.2 数字

一字

三字

4.1.3 アルファベット

アルファベットのみ

日本語も

str_view_all

文字数指定

4.1.4 数字とアルファベット

一文字

すべて

指定文字数

4.2 単語のつながり

4.2.1 例の作成

4.2.2 繰り返し

一回

二回

4.3 区分する点を探す

4.3.1 検討用データフレーム作成

4.3.1.1 meanとsdを列名にするpivot_longer(×な例)

4.3.2 変数名の文字の長さ確認

4.3.2.1 meanとsdを列名にするpivot_longer(okな例)

4.3.2.1.1 {n}の挙動

文字

数字

4.4 連続量を2値化して割合を出して1つのデータフレームに

4.4.1 2値変数作成

4.4.2 割合とnとtotal_nを算出

4.4.3 meanとsdを列名にするpivot_longer

5 度数分布とクロス表

5.1 度数分布

5.1.1 文字型でデフォルト

5.1.1.1 NA外して

5.1.1.2 countで%

5.1.2 因子型

5.1.2.1 countでデータのない水準もn=0で示す

5.1.2.2 tabylでデータのない水準もn=0で示してNAは計算に入れずtotalも示す

5.1.3 数字

5.2 クロス表

5.2.1 count

5.2.1.1 割合追加

5.2.2 欠損ある場合

5.2.2.1 割合追加

5.2.2.1.1 欠損あり

5.2.2.1.2 欠損drop

5.2.3 tabyl

5.2.3.1 割合

割合とn

割合（%）とn

nと割合（%)

5.2.4 3要因以上

5.3 modelsummary::datasummary()

6 関数

6.1 基本

6.1.1 省略形

6.1.2 関数内で数値または文字

6.2 無名関数 anonymous function

6.2.1 無名関数の省略形（R4.1以降）

6.2.1.1 \（バックスラッシュ）の表示について

6.3 関数の中身確認

6.4 平均値算出

6.4.1 平均値とn

6.4.1.1

6.4.1.2 確認

6.4.2 平均値とnで列をmeanとnに

6.4.2.1 確認

3.1 作成　

6.2.1.1 `\`（バックスラッシュ）の表示について