Nick's R Cheat Sheet

PDF of many of Rstudio’s cheat sheets

Another R cheat sheet I found useful

Selecting attributes from a data frame

# data [row, attribute_name]
iris[ 1, "Species"] 
#> [1] setosa
#> Levels: setosa versicolor virginica

# approach 1: use [[ form of extract operator to extract a column
iris[["Species"]] %>% 
  head()
#> [1] setosa setosa setosa setosa setosa setosa
#> Levels: setosa versicolor virginica

# approach 2: use variable name in column dimension of data frame
iris[,"Species"] %>% 
  head()
#> [1] setosa setosa setosa setosa setosa setosa
#> Levels: setosa versicolor virginica
# approach 3: use the $ form of extract operator. Note that since this
iris$Species %>% 
  head()
#> [1] setosa setosa setosa setosa setosa setosa
#> Levels: setosa versicolor virginica

#Use individual variables in a pipe
ggplot2::diamonds %>% 
  .$cut %>% 
  head()
#> [1] Ideal     Premium   Good      Premium   Good      Very Good
#> Levels: Fair < Good < Very Good < Premium < Ideal

ggplot2::diamonds %>% 
  dplyr::pull(cut) %>% 
  head()
#> [1] Ideal     Premium   Good      Premium   Good      Very Good
#> Levels: Fair < Good < Very Good < Premium < Ideal

subsetting rows

# approach 1: use exact row references
# dataframe[rows, columns]
mtcars[20:22,]
#>                   mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Toyota Corolla   33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
#> Toyota Corona    21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
#> Dodge Challenger 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2

# approach 2: use logic in the row dimension of reference
head(mtcars[mtcars$cyl == 4 & mtcars$am == 1,])
#>                 mpg cyl  disp hp drat    wt  qsec vs am gear carb
#> Datsun 710     22.8   4 108.0 93 3.85 2.320 18.61  1  1    4    1
#> Fiat 128       32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic    30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1
#> Fiat X1-9      27.3   4  79.0 66 4.08 1.935 18.90  1  1    4    1
#> Porsche 914-2  26.0   4 120.3 91 4.43 2.140 16.70  0  1    5    2
head(mtcars[mtcars[,"cyl"] == 4,])
#>                 mpg cyl  disp hp drat    wt  qsec vs am gear carb
#> Datsun 710     22.8   4 108.0 93 3.85 2.320 18.61  1  1    4    1
#> Merc 240D      24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2
#> Merc 230       22.8   4 140.8 95 3.92 3.150 22.90  1  0    4    2
#> Fiat 128       32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic    30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1

# approach 3: use which() function
theSubsetRows <- which(mtcars$cyl == 4)
head(mtcars[theSubsetRows,])
#>                 mpg cyl  disp hp drat    wt  qsec vs am gear carb
#> Datsun 710     22.8   4 108.0 93 3.85 2.320 18.61  1  1    4    1
#> Merc 240D      24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2
#> Merc 230       22.8   4 140.8 95 3.92 3.150 22.90  1  0    4    2
#> Fiat 128       32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic    30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1

# approach 4: use output from a function that returns a logical
#             array instead of row numbers as in the prior example
head(mtcars[!is.na(mtcars[,"cyl"]),])
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
#> Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# subset all rows with select  columns
# dataframe[, c("column1", "column2")]
head(mtcars[, c("mpg", "carb")])
#>                    mpg carb
#> Mazda RX4         21.0    4
#> Mazda RX4 Wag     21.0    4
#> Datsun 710        22.8    1
#> Hornet 4 Drive    21.4    1
#> Hornet Sportabout 18.7    2
#> Valiant           18.1    1

# subset all rows where attribute matches
# dataframe[dataframe$attribute =="value", ] 
mtcars[mtcars$cyl==4,]
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
#> Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
#> Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
#> Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
#> Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
#> Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2


# select all rows and all columns except for a few others
# in this example remove mpg and disp, the first and third columns
mtcars[,-c(1,3)]
#>                     cyl  hp drat    wt  qsec vs am gear carb
#> Mazda RX4             6 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag         6 110 3.90 2.875 17.02  0  1    4    4
#> Datsun 710            4  93 3.85 2.320 18.61  1  1    4    1
#> Hornet 4 Drive        6 110 3.08 3.215 19.44  1  0    3    1
#> Hornet Sportabout     8 175 3.15 3.440 17.02  0  0    3    2
#> Valiant               6 105 2.76 3.460 20.22  1  0    3    1
#> Duster 360            8 245 3.21 3.570 15.84  0  0    3    4
#> Merc 240D             4  62 3.69 3.190 20.00  1  0    4    2
#> Merc 230              4  95 3.92 3.150 22.90  1  0    4    2
#> Merc 280              6 123 3.92 3.440 18.30  1  0    4    4
#> Merc 280C             6 123 3.92 3.440 18.90  1  0    4    4
#> Merc 450SE            8 180 3.07 4.070 17.40  0  0    3    3
#> Merc 450SL            8 180 3.07 3.730 17.60  0  0    3    3
#> Merc 450SLC           8 180 3.07 3.780 18.00  0  0    3    3
#> Cadillac Fleetwood    8 205 2.93 5.250 17.98  0  0    3    4
#> Lincoln Continental   8 215 3.00 5.424 17.82  0  0    3    4
#> Chrysler Imperial     8 230 3.23 5.345 17.42  0  0    3    4
#> Fiat 128              4  66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic           4  52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla        4  65 4.22 1.835 19.90  1  1    4    1
#> Toyota Corona         4  97 3.70 2.465 20.01  1  0    3    1
#> Dodge Challenger      8 150 2.76 3.520 16.87  0  0    3    2
#> AMC Javelin           8 150 3.15 3.435 17.30  0  0    3    2
#> Camaro Z28            8 245 3.73 3.840 15.41  0  0    3    4
#> Pontiac Firebird      8 175 3.08 3.845 17.05  0  0    3    2
#> Fiat X1-9             4  66 4.08 1.935 18.90  1  1    4    1
#> Porsche 914-2         4  91 4.43 2.140 16.70  0  1    5    2
#> Lotus Europa          4 113 3.77 1.513 16.90  1  1    5    2
#> Ford Pantera L        8 264 4.22 3.170 14.50  0  1    5    4
#> Ferrari Dino          6 175 3.62 2.770 15.50  0  1    5    6
#> Maserati Bora         8 335 3.54 3.570 14.60  0  1    5    8
#> Volvo 142E            4 109 4.11 2.780 18.60  1  1    4    2

#alternate - if all rows you want to remove are next to each other
mtcars[,-(1:3)]
#>                      hp drat    wt  qsec vs am gear carb
#> Mazda RX4           110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag       110 3.90 2.875 17.02  0  1    4    4
#> Datsun 710           93 3.85 2.320 18.61  1  1    4    1
#> Hornet 4 Drive      110 3.08 3.215 19.44  1  0    3    1
#> Hornet Sportabout   175 3.15 3.440 17.02  0  0    3    2
#> Valiant             105 2.76 3.460 20.22  1  0    3    1
#> Duster 360          245 3.21 3.570 15.84  0  0    3    4
#> Merc 240D            62 3.69 3.190 20.00  1  0    4    2
#> Merc 230             95 3.92 3.150 22.90  1  0    4    2
#> Merc 280            123 3.92 3.440 18.30  1  0    4    4
#> Merc 280C           123 3.92 3.440 18.90  1  0    4    4
#> Merc 450SE          180 3.07 4.070 17.40  0  0    3    3
#> Merc 450SL          180 3.07 3.730 17.60  0  0    3    3
#> Merc 450SLC         180 3.07 3.780 18.00  0  0    3    3
#> Cadillac Fleetwood  205 2.93 5.250 17.98  0  0    3    4
#> Lincoln Continental 215 3.00 5.424 17.82  0  0    3    4
#> Chrysler Imperial   230 3.23 5.345 17.42  0  0    3    4
#> Fiat 128             66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic          52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla       65 4.22 1.835 19.90  1  1    4    1
#> Toyota Corona        97 3.70 2.465 20.01  1  0    3    1
#> Dodge Challenger    150 2.76 3.520 16.87  0  0    3    2
#> AMC Javelin         150 3.15 3.435 17.30  0  0    3    2
#> Camaro Z28          245 3.73 3.840 15.41  0  0    3    4
#> Pontiac Firebird    175 3.08 3.845 17.05  0  0    3    2
#> Fiat X1-9            66 4.08 1.935 18.90  1  1    4    1
#> Porsche 914-2        91 4.43 2.140 16.70  0  1    5    2
#> Lotus Europa        113 3.77 1.513 16.90  1  1    5    2
#> Ford Pantera L      264 4.22 3.170 14.50  0  1    5    4
#> Ferrari Dino        175 3.62 2.770 15.50  0  1    5    6
#> Maserati Bora       335 3.54 3.570 14.60  0  1    5    8
#> Volvo 142E          109 4.11 2.780 18.60  1  1    4    2

Filtering / extracting / subsetting data frames based on attribute value

mtcars[which(mtcars$cyl == 4), "mpg"]
#>  [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
subset(mtcars$mpg, mtcars$cyl == 4)
#>  [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4

Exploring data

#summary stats from pastecs package
pastecs::stat.desc(mtcars)
#>                      mpg         cyl         disp           hp
#> nbr.val       32.0000000  32.0000000 3.200000e+01   32.0000000
#> nbr.null       0.0000000   0.0000000 0.000000e+00    0.0000000
#> nbr.na         0.0000000   0.0000000 0.000000e+00    0.0000000
#> min           10.4000000   4.0000000 7.110000e+01   52.0000000
#> max           33.9000000   8.0000000 4.720000e+02  335.0000000
#> range         23.5000000   4.0000000 4.009000e+02  283.0000000
#> sum          642.9000000 198.0000000 7.383100e+03 4694.0000000
#> median        19.2000000   6.0000000 1.963000e+02  123.0000000
#> mean          20.0906250   6.1875000 2.307219e+02  146.6875000
#> SE.mean        1.0654240   0.3157093 2.190947e+01   12.1203173
#> CI.mean.0.95   2.1729465   0.6438934 4.468466e+01   24.7195501
#> var           36.3241028   3.1895161 1.536080e+04 4700.8669355
#> std.dev        6.0269481   1.7859216 1.239387e+02   68.5628685
#> coef.var       0.2999881   0.2886338 5.371779e-01    0.4674077
#>                      drat          wt        qsec          vs          am
#> nbr.val       32.00000000  32.0000000  32.0000000 32.00000000 32.00000000
#> nbr.null       0.00000000   0.0000000   0.0000000 18.00000000 19.00000000
#> nbr.na         0.00000000   0.0000000   0.0000000  0.00000000  0.00000000
#> min            2.76000000   1.5130000  14.5000000  0.00000000  0.00000000
#> max            4.93000000   5.4240000  22.9000000  1.00000000  1.00000000
#> range          2.17000000   3.9110000   8.4000000  1.00000000  1.00000000
#> sum          115.09000000 102.9520000 571.1600000 14.00000000 13.00000000
#> median         3.69500000   3.3250000  17.7100000  0.00000000  0.00000000
#> mean           3.59656250   3.2172500  17.8487500  0.43750000  0.40625000
#> SE.mean        0.09451874   0.1729685   0.3158899  0.08909831  0.08820997
#> CI.mean.0.95   0.19277224   0.3527715   0.6442617  0.18171719  0.17990541
#> var            0.28588135   0.9573790   3.1931661  0.25403226  0.24899194
#> std.dev        0.53467874   0.9784574   1.7869432  0.50401613  0.49899092
#> coef.var       0.14866382   0.3041285   0.1001159  1.15203687  1.22828533
#>                     gear       carb
#> nbr.val       32.0000000 32.0000000
#> nbr.null       0.0000000  0.0000000
#> nbr.na         0.0000000  0.0000000
#> min            3.0000000  1.0000000
#> max            5.0000000  8.0000000
#> range          2.0000000  7.0000000
#> sum          118.0000000 90.0000000
#> median         4.0000000  2.0000000
#> mean           3.6875000  2.8125000
#> SE.mean        0.1304266  0.2855297
#> CI.mean.0.95   0.2660067  0.5823417
#> var            0.5443548  2.6088710
#> std.dev        0.7378041  1.6152000
#> coef.var       0.2000825  0.5742933

#Counting NULL values in column
sum(is.na(mtcars$cyl))
#> [1] 0

Prevent garbage characters when using read.csv on data exported from SQL

#SOURCE: http://stackoverflow.com/questions/24568056/rs-read-csv-prepending-1st-column-name-with-junk-text
read.csv(file = "my_file.csv", fileEncoding = "UTF-8-BOM")

List functions exported by a loaded package

head(ls("package:base"))
#> [1] "-"         "-.Date"    "-.POSIXt"  "!"         "!.hexmode" "!.octmode"

Count occurrences of unique values

table(mtcars$cyl)
#> 
#>  4  6  8 
#> 11  7 14
dplyr::count(mtcars, cyl)
#> # A tibble: 3 x 2
#>     cyl     n
#>   <dbl> <int>
#> 1     4    11
#> 2     6     7
#> 3     8    14
as.data.frame(table(mtcars$cyl))
#>   Var1 Freq
#> 1    4   11
#> 2    6    7
#> 3    8   14

dplyr::count(mtcars, cyl, gear)
#> # A tibble: 8 x 3
#>     cyl  gear     n
#>   <dbl> <dbl> <int>
#> 1     4     3     1
#> 2     4     4     8
#> 3     4     5     2
#> 4     6     3     2
#> 5     6     4     4
#> 6     6     5     1
#> 7     8     3    12
#> 8     8     5     2
xtabs(~cyl + gear, mtcars)
#>    gear
#> cyl  3  4  5
#>   4  1  8  2
#>   6  2  4  1
#>   8 12  0  2

#Get table with row and column sums
addmargins(table(mtcars$cyl,useNA = "always"))
#> 
#>    4    6    8 <NA>  Sum 
#>   11    7   14    0   32
#Get percentage table
addmargins(sort(prop.table(table(mtcars$cyl,useNA = "always"))))
#> 
#>    <NA>       6       4       8     Sum 
#> 0.00000 0.21875 0.34375 0.43750 1.00000

Check for duplicate values

length(unique(nps$email))==nrow(nps)

#pull duplicate values
nycflights13::planes %>%
        dplyr::count(tailnum) %>%
        dplyr::filter(n > 1)

nycflights13::weather %>%
        dplyr::count(year, month, day, hour, origin) %>%
        filter(n > 1)

Build and install vignettes

devtools::install(build_vignettes = TRUE)

String filtering

#filter output of names - or any other character vector
row.names(mtcars)[grepl("Merc",row.names(mtcars))]
#> [1] "Merc 240D"   "Merc 230"    "Merc 280"    "Merc 280C"   "Merc 450SE" 
#> [6] "Merc 450SL"  "Merc 450SLC"
names(mtcars)[grepl("c", names(mtcars))]
#> [1] "cyl"  "qsec" "carb"
grep("C", names(mtcars), ignore.case = TRUE, value = TRUE)
#> [1] "cyl"  "qsec" "carb"
grep("C", names(mtcars), ignore.case = TRUE, value = FALSE)
#> [1]  2  7 11

#Filter with dplyr where row cointains string
dplyr::filter(ggplot2::diamonds, grepl('Good', cut))
#> # A tibble: 16,988 x 10
#>    carat cut       color clarity depth table price     x     y     z
#>    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
#>  1  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
#>  2  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
#>  3  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
#>  4  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
#>  5  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
#>  6  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
#>  7  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73
#>  8  0.3  Good      J     SI1      63.4    54   351  4.23  4.29  2.7 
#>  9  0.3  Good      J     SI1      63.8    56   351  4.23  4.26  2.71
#> 10  0.3  Very Good J     SI1      62.7    59   351  4.21  4.27  2.66
#> # ... with 16,978 more rows

#Filter column/attribute of data frame for regex text pattern match
nycflights13::airports %>%
  dplyr::filter(stringr::str_detect(name, "County")) %>% 
  head()
#> # A tibble: 6 x 8
#>   faa   name                      lat    lon   alt    tz dst   tzone       
#>   <chr> <chr>                   <dbl>  <dbl> <dbl> <dbl> <chr> <chr>       
#> 1 0G6   Williams County Airport  41.5  -84.5   730    -5 A     America/New~
#> 2 0S9   Jefferson County Intl    48.1 -123.    108    -8 A     America/Los~
#> 3 0W3   Harford County Airport   39.6  -76.2   409    -5 A     America/New~
#> 4 17G   Port Bucyrus-Crawford ~  40.8  -83.0  1003    -5 A     America/New~
#> 5 19A   Jackson County Airport   34.2  -83.6   951    -5 U     America/New~
#> 6 24J   Suwannee County Airport  30.3  -83.0   104    -5 A     America/New~

#Search for, and extract matches
pattern <- "[[:upper:]]"
stringr::sentences %>%
        stringr::str_subset(pattern) %>%
        stringr::str_extract_all(pattern) %>% 
  head()
#> [[1]]
#> [1] "T"
#> 
#> [[2]]
#> [1] "G"
#> 
#> [[3]]
#> [1] "I"
#> 
#> [[4]]
#> [1] "T"
#> 
#> [[5]]
#> [1] "R"
#> 
#> [[6]]
#> [1] "T"

#filter column/attriubte names based on string
ggplot2::diamonds %>%
  dplyr::select_at(dplyr::vars(dplyr::contains('c')))
#> # A tibble: 53,940 x 5
#>    carat cut       color clarity price
#>    <dbl> <ord>     <ord> <ord>   <int>
#>  1 0.23  Ideal     E     SI2       326
#>  2 0.21  Premium   E     SI1       326
#>  3 0.23  Good      E     VS1       327
#>  4 0.290 Premium   I     VS2       334
#>  5 0.31  Good      J     SI2       335
#>  6 0.24  Very Good J     VVS2      336
#>  7 0.24  Very Good I     VVS1      336
#>  8 0.26  Very Good H     SI1       337
#>  9 0.22  Fair      E     VS2       337
#> 10 0.23  Very Good H     VS1       338
#> # ... with 53,930 more rows

#filter where attribute value starts with a string
mtcars %>% 
    tibble::rownames_to_column("model") %>% 
    dplyr::filter(stringr::str_detect(model, "^Merc"))
#>         model  mpg cyl  disp  hp drat   wt qsec vs am gear carb
#> 1   Merc 240D 24.4   4 146.7  62 3.69 3.19 20.0  1  0    4    2
#> 2    Merc 230 22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
#> 3    Merc 280 19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4
#> 4   Merc 280C 17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4
#> 5  Merc 450SE 16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3
#> 6  Merc 450SL 17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3
#> 7 Merc 450SLC 15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3

#rename columns/attributes if they contain certain pattern
#in this case replace '.' with '_'
#SOU
nycflights13::weather %>% 
  dplyr::rename_at(
    dplyr::vars(dplyr::contains('_')), dplyr::funs(stringr::str_replace(., "_", "\\."))
    ) %>% 
  names()
#> Warning: funs() is soft deprecated as of dplyr 0.8.0
#> Please use a list of either functions or lambdas: 
#> 
#>   # Simple named list: 
#>   list(mean = mean, median = median)
#> 
#>   # Auto named with `tibble::lst()`: 
#>   tibble::lst(mean, median)
#> 
#>   # Using lambdas
#>   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> This warning is displayed once per session.
#>  [1] "origin"     "year"       "month"      "day"        "hour"      
#>  [6] "temp"       "dewp"       "humid"      "wind.dir"   "wind.speed"
#> [11] "wind.gust"  "precip"     "pressure"   "visib"      "time.hour"

String filtering when using a database and dbplyr

The previous mentions won’t always work when filtering against data in a database. Reference: https://github.com/tidyverse/dplyr/issues/3090

Solution from: https://stackoverflow.com/questions/38962585/pass-sql-functions-in-dplyr-filter-function-on-database/47198795#47198795

#load some data into a temp database
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")
DBI::dbWriteTable(con, "airports", nycflights13::airports)

#query the data - name contains "Island" - this is case sensitive
dplyr::tbl(con, "airports") %>% 
  dplyr::filter(name %like% "%Island%") %>% 
  head(5)
#> # Source:   lazy query [?? x 8]
#> # Database: sqlite 3.29.0 []
#>   faa   name                      lat    lon   alt    tz dst   tzone       
#>   <chr> <chr>                   <dbl>  <dbl> <dbl> <dbl> <chr> <chr>       
#> 1 09J   Jekyll Island Airport    31.1  -81.4    11    -5 A     America/New~
#> 2 2B2   Plum Island Airport      42.8  -70.8    11    -5 A     America/New~
#> 3 ANN   Annette Island           55.0 -132.    119    -9 A     America/Anc~
#> 4 BID   Block Island State Air~  41.2  -71.6   108    -5 A     America/New~
#> 5 BRO   Brownsville South Padr~  25.9  -97.4    22    -6 A     America/Chi~

This works when I test against a Microsoft SQL database but not SQLite. Returns error: Error in stri_detect_regex(string, pattern, opts_regex = opts(pattern)) : object 'name' not found

#load some data into a temp database
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")
DBI::dbWriteTable(con, "airports", nycflights13::airports)

#query the data - name contains "Island" - this is case sensitive
dplyr::tbl(con, "airports") %>% 
  dplyr::filter(stringr::str_detect(name, "Island")) %>% 
  head(5)

Create permutations of items from multple vectors

expand.grid(
    c(1, 2),
    c(3, 4),
    c(2, 3))
#>   Var1 Var2 Var3
#> 1    1    3    2
#> 2    2    3    2
#> 3    1    4    2
#> 4    2    4    2
#> 5    1    3    3
#> 6    2    3    3
#> 7    1    4    3
#> 8    2    4    3

Get class names of items in a dataframe/vector

sapply(mtcars, class)
#>       mpg       cyl      disp        hp      drat        wt      qsec 
#> "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
#>        vs        am      gear      carb 
#> "numeric" "numeric" "numeric" "numeric"

Printing more than default rows/columns from a table

mtcars %>% dplyr::as_tibble() %>% print(n=15, width = Inf)
#> # A tibble: 32 x 11
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> 11  17.8     6  168.   123  3.92  3.44  18.9     1     0     4     4
#> 12  16.4     8  276.   180  3.07  4.07  17.4     0     0     3     3
#> 13  17.3     8  276.   180  3.07  3.73  17.6     0     0     3     3
#> 14  15.2     8  276.   180  3.07  3.78  18       0     0     3     3
#> 15  10.4     8  472    205  2.93  5.25  18.0     0     0     3     4
#> # ... with 17 more rows

Search all objects, including functions, in global environment for string

apropos("cars")
#> [1] "cars"   "mtcars"

Search key words or phrases in help pages, vignettes or task views

RSiteSearch("Microsoft AND SQL")

Update packages after updating R

update.packages(ask = FALSE, checkBuilt = TRUE)

Timezone stuff

#Get timezone
Sys.timezone()
#> [1] "America/New_York"

#Get all timezones
OlsonNames() %>% head()
#> [1] "Africa/Abidjan"     "Africa/Accra"       "Africa/Addis_Ababa"
#> [4] "Africa/Algiers"     "Africa/Asmara"      "Africa/Asmera"

Calculate minutes between two times

library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
ymd_hms("2018-10-25 12:00:00") %--% ymd_hms("2018-10-25 13:00:00") / dminutes(1)
#> [1] 60

filter for records within last n years

works with dbplyr too this is not a great example but I wanted it to work for at least a few more years.

hundred_years_ago <- lubridate::today() - lubridate::years(100)

nycflights13::flights %>% 
  dplyr::filter(time_hour >= hundred_years_ago)
#> # A tibble: 336,776 x 19
#>     year month   day dep_time sched_dep_time dep_delay arr_time
#>    <int> <int> <int>    <int>          <int>     <dbl>    <int>
#>  1  2013     1     1      517            515         2      830
#>  2  2013     1     1      533            529         4      850
#>  3  2013     1     1      542            540         2      923
#>  4  2013     1     1      544            545        -1     1004
#>  5  2013     1     1      554            600        -6      812
#>  6  2013     1     1      554            558        -4      740
#>  7  2013     1     1      555            600        -5      913
#>  8  2013     1     1      557            600        -3      709
#>  9  2013     1     1      557            600        -3      838
#> 10  2013     1     1      558            600        -2      753
#> # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
#> #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>, time_hour <dttm>

View data exported by a package

data(package = "ggplot2")

Browse vignettes for a given package

browseVignettes(package = "dplyr")

Open function documentation from w/in RStudio

Use F2

Create a dataframe with random data

#the call to set.seed ensures anyone who runs the code will get the same random number stream. 
set.seed(1)
data.frame(x = rnorm(4), y = rnorm(4), z = sample(LETTERS, 4))
#>            x          y z
#> 1 -0.6264538  0.3295078 V
#> 2  0.1836433 -0.8204684 N
#> 3 -0.8356286  0.4874291 J
#> 4  1.5952808  0.7383247 G

Details about built-in data sets

library(help = "datasets")
data()

Use Github to search for packages using a particular function

Put this into the GitHub search box to see how packages on CRAN use the llply() function from plyr

 "llply" user:cran language:R

Conditional mutate

NOTE: should probably also look at recode here

nycflights13::airlines %>%
    dplyr::mutate(
        name = dplyr::case_when(
            name == "Virgin America" ~ "Alaska Airlines Inc.",
            TRUE ~ name)
    )
#> # A tibble: 16 x 2
#>    carrier name                       
#>    <chr>   <chr>                      
#>  1 9E      Endeavor Air Inc.          
#>  2 AA      American Airlines Inc.     
#>  3 AS      Alaska Airlines Inc.       
#>  4 B6      JetBlue Airways            
#>  5 DL      Delta Air Lines Inc.       
#>  6 EV      ExpressJet Airlines Inc.   
#>  7 F9      Frontier Airlines Inc.     
#>  8 FL      AirTran Airways Corporation
#>  9 HA      Hawaiian Airlines Inc.     
#> 10 MQ      Envoy Air                  
#> 11 OO      SkyWest Airlines Inc.      
#> 12 UA      United Air Lines Inc.      
#> 13 US      US Airways Inc.            
#> 14 VX      Alaska Airlines Inc.       
#> 15 WN      Southwest Airlines Co.     
#> 16 YV      Mesa Airlines Inc.

extracting nested list into a tibble

SOURCE: https://cfss.uchicago.edu/webdata004_simplifying_lists.html

my_list <- list(
    list(first = "nick", last = "vasile", weight = 170),
    list(first = "bob", last = "smith", weight = 150)
)

my_list
#> [[1]]
#> [[1]]$first
#> [1] "nick"
#> 
#> [[1]]$last
#> [1] "vasile"
#> 
#> [[1]]$weight
#> [1] 170
#> 
#> 
#> [[2]]
#> [[2]]$first
#> [1] "bob"
#> 
#> [[2]]$last
#> [1] "smith"
#> 
#> [[2]]$weight
#> [1] 150

purrr::map_df(my_list, magrittr::extract)
#> # A tibble: 2 x 3
#>   first last   weight
#>   <chr> <chr>   <dbl>
#> 1 nick  vasile    170
#> 2 bob   smith     150

Scatterplot of all pairs

pairs(mtcars)

Interactively explore plots

identify(mtcars$hp, mtcars$mpg, mtcars$mpg)

Sort bar plot by counts

nycflights13::flights %>% 
    ggplot2::ggplot(ggplot2::aes(x=forcats::fct_infreq(carrier))) +
    ggplot2::geom_bar()

nycflights13::flights %>%
  ggplot2::ggplot(ggplot2::aes(x=forcats::fct_rev(
    forcats::fct_infreq(carrier)))) +
  ggplot2::geom_bar() +
  ggplot2::coord_flip()

Sort bar plot by counts - when using stat = “identity”

avg_delay_by_carrier <- nycflights13::flights %>% 
  dplyr::group_by(carrier) %>% 
  dplyr::summarize(mean_delay = mean(dep_delay, na.rm = TRUE))

avg_delay_by_carrier %>% 
  ggplot2::ggplot(ggplot2::aes(x=reorder(carrier, -mean_delay),
                               y = mean_delay)) +
  ggplot2::geom_bar(stat="identity") +
  ggplot2::labs(x = "carrier") # if you want to set x-axis to original attribute

avg_delay_by_carrier <- nycflights13::flights %>% 
  dplyr::group_by(carrier) %>% 
  dplyr::summarize(mean_delay = mean(dep_delay, na.rm = TRUE))

avg_delay_by_carrier %>% 
  ggplot2::ggplot(ggplot2::aes(x=reorder(carrier, mean_delay),
                               y = mean_delay)) +
  ggplot2::geom_bar(stat="identity") +
  ggplot2::coord_flip()

Color coded correlation table

M <- cor(mtcars)
corrplot::corrplot(M, method="color")

with clustering

corrplot::corrplot(M, order = "hclust")

filter correlations at a cutoff value

caret::findCorrelation(M, cutoff = 0.2, verbose = TRUE, names = TRUE)
#> Compare row 2  and column  3 with corr  0.902 
#>   Means:  0.701 vs 0.546 so flagging column 2 
#> Compare row 3  and column  1 with corr  0.848 
#>   Means:  0.658 vs 0.513 so flagging column 3 
#> Compare row 1  and column  6 with corr  0.868 
#>   Means:  0.63 vs 0.483 so flagging column 1 
#> Compare row 6  and column  4 with corr  0.659 
#>   Means:  0.543 vs 0.446 so flagging column 6 
#> Compare row 4  and column  8 with corr  0.723 
#>   Means:  0.5 vs 0.418 so flagging column 4 
#> Compare row 8  and column  5 with corr  0.44 
#>   Means:  0.426 vs 0.394 so flagging column 8 
#> Compare row 5  and column  9 with corr  0.713 
#>   Means:  0.399 vs 0.365 so flagging column 5 
#> Compare row 9  and column  10 with corr  0.794 
#>   Means:  0.36 vs 0.352 so flagging column 9 
#> Compare row 10  and column  11 with corr  0.274 
#>   Means:  0.243 vs 0.339 so flagging column 11 
#> Compare row 10  and column  7 with corr  0.213 
#>   Means:  0.213 vs 0.213 so flagging column 7 
#> All correlations <= 0.2
#>  [1] "cyl"  "disp" "mpg"  "wt"   "hp"   "vs"   "drat" "am"   "carb" "qsec"

I dont really like the formatting here - too hard to match columns - let’s find a better way.

There is corrr::correlations() but not available on CRAN and there is also an open issue with the newest version of dplyr.

Using deprecrated reshape2 package because tidyr doesn’t handle matrixs. This is fine for now.

reshape2::melt(M) %>%
  dplyr::filter(abs(value) > 0.2, Var1 != Var2) %>%
  dplyr::arrange(desc(abs(value))) %>%
  head()
#>   Var1 Var2      value
#> 1 disp  cyl  0.9020329
#> 2  cyl disp  0.9020329
#> 3   wt disp  0.8879799
#> 4 disp   wt  0.8879799
#> 5   wt  mpg -0.8676594
#> 6  mpg   wt -0.8676594

plot distribution of all variables

Just the numeric variables

SOURCE: (https://drsimonj.svbtle.com/quick-plot-of-all-variables)

mtcars %>%
    purrr::keep(is.numeric) %>% 
    tidyr::gather() %>% 
    ggplot2::ggplot(ggplot2::aes(value)) + 
    ggplot2::facet_wrap(~key, scales = "free") + 
    ggplot2::geom_histogram()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

all variables

nycflights13::planes[,c(4,9)] %>%
  tidyr::gather() %>% 
  ggplot2::ggplot(ggplot2::aes(value)) + 
  ggplot2::facet_wrap(~key, scales = "free") + 
  ggplot2::geom_bar(stat = "count") + 
  ggplot2::coord_flip ()

RMarkdown: insert date document was knitted

add this to the header:

SOURCE: (https://stackoverflow.com/questions/23449319/yaml-current-date-in-rmarkdown)

Plot percentage of attributes that are NA for each outcome

data(Soybean, package = "mlbench")

# the first column is the outcome variable so we remove it
na_rates <- rowMeans(is.na(Soybean[,-1]))

soybean <- Soybean %>% 
  tibble::add_column(na_rate = na_rates)

soybean %>% 
    dplyr::group_by(Class) %>% 
    dplyr::summarize(mean_na_rate = mean(na_rate)) %>% 
    ggplot2::ggplot(ggplot2::aes(x=Class, y = mean_na_rate, fill = Class)) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::coord_flip() +
    ggplot2::theme(legend.position = "none")

Plot the pecentage of rows that has at least 1 NA attribute, by outcome

data(Soybean, package = "mlbench")

na_rates <- rowMeans(is.na(Soybean[,-1]))

soybean <- Soybean %>%
    tibble::add_column(na_rates = na_rates) %>% 
    dplyr::mutate(has_nas = na_rates != 0 )

soybean %>% 
    dplyr::group_by(Class) %>% 
    dplyr::summarize(mean_has_nas = mean(has_nas)) %>% 
    ggplot2::ggplot(ggplot2::aes(x=Class, y = mean_has_nas, fill = Class)) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::coord_flip() +
    ggplot2::theme(legend.position = "none")

Plot the attributes (predictors) that are most likely to be missing

data(Soybean, package = "mlbench")

predictor_na_rate <- data.frame(colMeans(is.na(Soybean[,-1]))) %>% 
    tibble::rownames_to_column(var = "predictor") %>% 
    dplyr::rename( na_rate = 2)

predictor_na_rate %>% ggplot2::ggplot(ggplot2::aes(x = predictor, y = na_rate,
                                                   fill = predictor )) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::coord_flip() +
  ggplot2::theme(legend.position = "none")

Plot the attributes (predictors) that are most likely to be missing, by outcome

data(Soybean, package = "mlbench")

soybean_l <- Soybean %>% 
  tidyr::gather(-Class, key = "predictor",value = "value")
#> Warning: attributes are not identical across measure variables;
#> they will be dropped

soybean_l %>% dplyr::group_by(Class, predictor) %>% 
  dplyr::summarize(na_rate = mean(is.na(value))) %>% 
  ggplot2::ggplot(ggplot2::aes(x=predictor, y = na_rate, fill = predictor)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::coord_flip() +
  ggplot2::facet_wrap(~Class, scales = "free") +
  ggplot2::theme(legend.position = "none")

Create a matrix that shows whether or not a particular combination of values is in the data

SOURCE: https://stackoverflow.com/a/37897416

mtcars %>% 
  tibble::rownames_to_column("name") %>% 
  dplyr::distinct(cyl, gear, has_gear = !is.na(name)) %>% 
  tidyr::complete( cyl, gear) %>% 
  dplyr::mutate(has_gear = dplyr::if_else(is.na(has_gear),FALSE, TRUE)) %>% 
  ggplot2::ggplot(ggplot2::aes(x=cyl, y =gear )) +
  ggplot2::geom_tile(ggplot2::aes(fill = has_gear),color = "white")

Convert unix style epoch time to human readable time

epoch_time <- 1505249866329

lubridate::as_datetime(epoch_time/1000)
#> [1] "2017-09-12 20:57:46 UTC"

Clean Data: remove columns where no rows contain a value


df <-data.frame(col_1 = c(NA, 1, 2, 3),
                col_2 = c(NA, NA, NA, NA),
                col_3 = c(1, 2, 3, 4)) 
  
df
#>   col_1 col_2 col_3
#> 1    NA    NA     1
#> 2     1    NA     2
#> 3     2    NA     3
#> 4     3    NA     4
  
case_missing_rate <- data.frame(colMeans(is.na(df))) %>%
        tibble::rownames_to_column(var = "column") %>%
        dplyr::rename(missing_rate = 2)

    all_missing_cols <- case_missing_rate %>%
        dplyr::filter(missing_rate == 1.0) %>%
        dplyr::pull(column)

    df %>%
        dplyr::select(-all_missing_cols)
#>   col_1 col_3
#> 1    NA     1
#> 2     1     2
#> 3     2     3
#> 4     3     4

One plot for each attribute in a data frame - different scale for each attribute

datasets::airquality %>% 
  dplyr::filter(Month == 5) %>% 
  dplyr::select(-Month) %>% 
  tidyr::pivot_longer(-Day, names_to = "attribute", values_to = "value") %>% 
    ggplot2::ggplot(ggplot2::aes(Day, value)) +
    ggplot2::geom_line() +
    ggplot2::facet_grid(attribute ~ ., scales = "free")