1 Uses of ggplot2

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

2 Data Transformations

# filter(mpg, model == "a4" & year > 2000)
# select(mpg, model:cyl)
# select(mpg, -(model:cyl))
# arrange(mpg, year, cty)
# mutate(mpg, ctyvshwy = cty - hwy)
# mutate(mpg, ctyvshwy = cty - hwy) %>% select(model, cty, hwy, ctyvshwy) %>% filter(ctyvshwy > -3)
# transmute(mpg, ctyvshwy = cty - hwy)   # like mutate(), but keeps only the new columns

# Install nycflights13 only when it is missing, then attach it.
# requireNamespace() tests availability without attaching the package, so
# library() below is the single place where it is loaded, and library()
# stops with an error if the installation did not succeed.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)

# Per-destination summary of `flights`: number of flights, mean distance and
# mean arrival delay (NA values ignored in both means). Destinations with 20
# or fewer flights, and "HNL", are dropped.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist  = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20 & dest != "HNL")

# Mean delay against mean distance; point size shows the number of flights,
# and a smoother without a confidence band is drawn on top.
ggplot(data = delays, mapping = aes(x = dist, y = delay)) +
  geom_point(mapping = aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE, span = 0.6)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# group_by(mpg, manufacturer) %>% summarise(count = n(), mean_hwy = median(hwy, na.rm = TRUE), mean_cty = mean(cty, na.rm = TRUE), diff = mean(hwy) - mean(cty)) %>% arrange(desc(mean_hwy)) %>% filter(count %in% c(1,3,6,9) | count > 15)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# var = mtcars %>% group_by(cyl) %>% summarise(count = n(), mean_mpg = mean(mpg), mean_hp = mean(hp), median_wt = median(wt))

# ggplot(var, aes(cyl,mean_mpg)) + geom_point(aes(size = mean_hp, color = median_wt))

# Row-wise tibble literal: the first row gives the column names as ~name,
# and each following line supplies one row of values.
tb <- tribble(
  ~index, ~size, ~wt,
  "1",5,10.1,
  "2",3,11.4,
  "3",2,9.98
)

# Column-wise tibble with 1000 rows built from random values. The result is
# not assigned, so it is only printed.
tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)

# Small random tibble (5 rows) used by the plot and the column-extraction
# examples that follow.
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)
df

# Scatter plot of y against x, with the point colour mapped to x.
ggplot(df,aes(x, y)) +
  geom_point(aes(color = x))

# Five ways of pulling column x out of df; the recorded output below is the
# same vector each time.

# By name with $.
df$x

## [1] 0.3131381 0.4497158 0.2151312 0.6482134 0.5434941

# By name with [[ ]].
df[["x"]]

## [1] 0.3131381 0.4497158 0.2151312 0.6482134 0.5434941

# By position with [[ ]].
df[[1]]

## [1] 0.3131381 0.4497158 0.2151312 0.6482134 0.5434941

# Inside a %>% pipeline the piped value is referred to as `.`.
df %>% .$x

## [1] 0.3131381 0.4497158 0.2151312 0.6482134 0.5434941

df %>% .[["x"]]

## [1] 0.3131381 0.4497158 0.2151312 0.6482134 0.5434941

3 Reading Files

# Parse CSV text supplied inline instead of a file path: skip = 1 drops the
# leading "first line" row, col_names supplies the header, and na = "." makes
# a lone "." be read as a missing value.
read_csv("first line\n1,2,3\n4,.,6\n1,0,1", skip = 1, col_names = c("x", "y", "z"), na = ".")

4 Parsing a Vector

# parse_date(): character vector to Date; na = "." turns the "." entry into NA
# (see the recorded output).
str(parse_date(c("2010-01-01", '2019-01-01', '1990-01-01','.'), na = "."))

##  Date[1:4], format: "2010-01-01" "2019-01-01" "1990-01-01" NA

# parse_character(): the input stays a character vector.
str(parse_character(c("a","b")))

##  chr [1:2] "a" "b"

# parse_double() with a locale whose decimal mark is a comma.
parse_double("1,23", locale = locale(decimal_mark = ","))

## [1] 1.23

# parse_number(): extracts the number and ignores the surrounding text.
parse_number(c("$100","20%","40 dollars", "#50"))

## [1] 100  20  40  50

# charToRaw(): the bytes of a string, printed in hexadecimal.
charToRaw("Leriche")

## [1] 4c 65 72 69 63 68 65

charToRaw(".")

## [1] 2e

# Round trip: string -> raw bytes -> string.
# NOTE(review): the variable name `t` is also the name of base R's
# transpose function t().
t <- charToRaw("b")
rawToChar(t)

## [1] "b"

# parse_time(): the recorded output shows an hms-style time value.
library(hms)
parse_time("01:10 am")

## 01:10:00

# guess_parser(): reports which parser readr would pick for the input.
guess_parser("2018-10-04")

## [1] "date"

5 Exporting/Importing Files

# write_excel_csv()

# write_csv(data, "filename.csv")

# readxl::read_excel()  reads Excel files (.xls and .xlsx).

# haven  reads SPSS, Stata and SAS files (read_sav(), read_dta(), read_sas()).

# gather()

# Wide table: one row per country, one column per year.
table4a <- tribble(
  ~"country",~`1999`,~`2000`,
  "USA", 444, 888,
  "BRA", 232, 458
)

# Gather the `1999` and `2000` columns into two columns: "year" holds the
# former column name and "cases" holds its value.
# NOTE(review): tidyr documents gather() as superseded by pivot_longer().
table4a %>% 
  gather(`1999`, `2000`, key = "year", value = "cases")

# spread()

# Long table: each country/year pair occupies two rows, one per `type`
# ("cases" and "population"), with the value in `count`.
table2 <- tribble(
  ~"country",~"year",~"type",~"count",
  "USA", 1999, "cases", 759,
  "USA", 1999, "population", 87432,
 "USA", 2000, "cases", 888,
  "USA", 2000, "population", 499843
)

table2

# Spread `type` into column names filled from `count`. The enclosing
# parentheses make the result print.
# NOTE(review): tidyr documents spread() as superseded by pivot_wider().
(table2 %>%
  spread(key = type, value = count))

# separate()

# `rate` packs two numbers into one string, written as "cases/population".
table3 <- tribble(
  ~"country",~"year",~"rate",
  "USA", 2000, "586/58943",
  "USA", 2001, "588/76633",
  "FRA", 2001, "45323/12345"
)

# First separate(): split `rate` into `cases` and `population` using the
# default separator; convert = TRUE asks for the new columns to be converted
# from character to a more suitable type.
# Second separate(): an integer `sep` splits by position, so `year` is cut
# after its second character into `century` and `year`.
table5 <- table3 %>%
  separate(rate, into = c("cases","population"), convert = TRUE) %>%
  separate(year, into = c("century", "year"), sep = 2)

# unite(new, century, year)

# Paste `century` and `year` back together into `new`, with no separator.
table5 %>%
  unite(new, century, year, sep = "")

# Quarterly returns. One value is an explicit NA (2015 Q4), and the row for
# 2016 Q1 is simply absent.
stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

# One column per quarter.
stocks %>%
  spread(qtr, return)

# One column per year, then back to long form; na.rm = TRUE drops the rows
# whose return is NA.
stocks %>%
  spread(year, return) %>%
  gather(year, return, `2015`:`2016`, na.rm = TRUE)

# complete()

# Expand to every combination of year and qtr.
stocks %>%
  complete(year, qtr)

# `person` is NA on the second and third rows.
treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

# fill() replaces the NA entries in `person`; tidyr's default direction is
# downward, i.e. the value is carried from the previous row.
treatment %>%
  fill(person)

# Step 1: gather the columns new_sp_m014 through newrel_f65 into a `key`
# column (former column name) and a `cases` column (value), dropping NAs.
who1 <- who %>%
  gather(new_sp_m014:newrel_f65, key = "key", value = "cases", na.rm = TRUE)

# Number of rows per key.
who1 %>%
  count(key)

# Step 2: rewrite "newrel" as "new_rel" - presumably so that every key has
# the same "_"-separated shape for the split in step 3.
who2 <- who1 %>%
  mutate(key = stringr::str_replace(key, "newrel", "new_rel"))

who2

# Step 3: split `key` on "_" into `new`, `type` and `sexage`.
who3 <- who2 %>%
  separate(key,c("new","type","sexage"), sep = "_")

who3

# Inspect the values of `new` before it is dropped in step 4.
who3 %>%
  count(new)

# Step 4: drop the `new`, `iso2` and `iso3` columns.
who4 <- who3 %>%
  select(-new, -iso2, -iso3)

# Step 5: split `sexage` after its first character into `sex` and `age`.
who5 <- who4 %>%
  separate(sexage, c("sex","age"), sep = 1)

who5

# List any tailnum that occurs more than once in `planes`.
planes %>%
  count(tailnum) %>%
  filter(n > 1)

# Same check for the (year, month, day, hour, origin) combination in
# `weather`.
weather %>%
  count(year, month, day, hour, origin) %>%
  filter(n > 1)

# Narrower copy of `flights` used by the join examples.
flights2 <- flights %>%
  select(year:day, hour, origin, dest, tailnum, carrier)

flights2

# Attach the airline name through a left join on `carrier` ...
flights2 %>%
  select(-c(origin, dest)) %>%
  left_join(airlines, by = "carrier")

# ... and the same lookup written with match() inside mutate().
flights2 %>%
  select(-c(origin, dest)) %>%
  mutate(
    name = airlines$name[match(carrier, airlines$carrier)]
  )

# Airport details for the destination: flights2$dest is matched against
# airports$faa.
flights2 %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  arrange(origin)

# Airport details for the origin: flights2$origin is matched against
# airports$faa.
flights2 %>%
  left_join(airports, by = c("origin" = "faa")) %>%
  arrange(origin)

# Keep only airports that appear as a destination in `flights` and have
# lon > -140, then plot them over state borders, coloured by `alt`.
airports %>%
  semi_join(flights, by = c("faa" = "dest")) %>%
  filter(lon > -140) %>%
  ggplot(mapping = aes(x = lon, y = lat)) +
    borders("state") +
    geom_point(mapping = aes(color = alt)) +
    coord_quickmap()

# Ten most frequent destinations; the enclosing parentheses print the result.
(top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10))

# The same ten destinations with airport details joined on (dest matched
# against airports$faa). head(10) is applied before the join so that top_dest
# stays restricted to the top ten; without it top_dest would hold every
# destination and the filter below would keep every flight.
top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10) %>%
  left_join(airports, by = c("dest" = "faa"))

top_dest

# Flights that went to one of the top ten destinations.
flights %>%
  filter(dest %in% top_dest$dest)

6 Strings

# A \u escape writes a character by its Unicode code point.
x <- "\u00b5"
x

## [1] "µ"

string <- c("x","y","z")
string

## [1] "x" "y" "z"

# str_c() joins its arguments into one string ...
str_c("x","y","z")

## [1] "xyz"

# ... and `sep` is placed between them.
str_c("x","y","z", sep = ", ")

## [1] "x, y, z"

name <- "Tomas"
time_of_day <- "morning"
birthday <- FALSE

# With birthday FALSE the `if` yields NULL, and the recorded output shows
# that the birthday text is simply left out.
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)

## [1] "Good morning Tomas."

# `collapse` joins the elements of a single vector into one string.
str_c(c("x", "y", "z"), collapse = ":")

## [1] "x:y:z"

# str_sub() extracts by character position; negative positions count from
# the end of the string.
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 2)

## [1] "Ap" "Ba" "Pe"

str_sub(x, -3, -1)

## [1] "ple" "ana" "ear"

# Assignment form of str_sub(): replace the first character with its
# lower-case version.
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x

## [1] "apple"  "banana" "pear"

x <- c("x","d","y","u","q","a","b")

# str_sort() returns the sorted strings.
str_sort(x)

## [1] "a" "b" "d" "q" "u" "x" "y"

x <- c("xylophone","apples","pears","chicken","pork","ate","bee")

# str_view() displays where the pattern "a" matches in x.
str_view(x, "a")

7 Determine Matches

# Print each element of x on its own line.
writeLines(x)

## xylophone
## apples
## pears
## chicken
## pork
## ate
## bee

# str_detect() gives one TRUE/FALSE per element: does it contain "e"?
str_detect(x, "e")

## [1]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE

# Summing the logical vector counts the entries of `words` that end in a
# vowel ...
sum(str_detect(words, "[aeiou]$"))

## [1] 271

# ... and its mean is the proportion of such entries.
mean(str_detect(words, "[aeiou]$"))

## [1] 0.2765306

# Two formulations of "contains no vowel": negate a match for any vowel ...
no_vowels_1 <- !str_detect(words, "[aeiou]")

# ... or require that every character from start to end is a non-vowel.
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)

## [1] TRUE

8 Models

# Install modelr only when it is missing, then attach it.
# requireNamespace() tests availability without attaching the package, so
# library() below is the single place where it is loaded, and library()
# stops with an error if the installation did not succeed.
if (!requireNamespace("modelr", quietly = TRUE)) {
  install.packages("modelr")
}
library(modelr)

# Set the session-wide NA handler for model fitting to modelr's na.warn.
options(na.action = na.warn)

# Scatter plot of the sim1 data.
ggplot(data = sim1, mapping = aes(x = x, y = y)) +
  geom_point()

# 250 random candidate lines: a1 is used as the intercept and a2 as the
# slope in the plot below.
models <- tibble(
  a1 = runif(250, min = -20, max = 40),
  a2 = runif(250, min = -5, max = 5)
)

# Draw every candidate line (semi-transparent) underneath the data points.
ggplot(data = sim1, mapping = aes(x = x, y = y)) +
  geom_abline(
    mapping = aes(intercept = a1, slope = a2),
    data = models,
    alpha = 1 / 4
  ) +
  geom_point()

# Straight-line prediction: intercept + slope * x.
#   a     numeric vector; a[1] is the intercept and a[2] the slope
#   data  object with an `x` component reachable through `$`
# Returns one prediction per element of data$x.
model1 <- function(a, data) {
  intercept <- a[1]
  slope <- a[2]
  intercept + slope * data$x
}

# Predictions of model1() with intercept 5 and slope 5 for every x in sim1.
model1(c(5,5), sim1)

##  [1] 10 10 10 15 15 15 20 20 20 25 25 25 30 30 30 35 35 35 40 40 40 45 45
## [24] 45 50 50 50 55 55 55

9 Root-mean-squared deviation

# Root-mean-squared deviation between the observed data$y and the
# predictions of model1() for the parameter vector `mod`.
#   mod   parameter vector handed to model1()
#   data  object with `x` and `y` components
# Returns a single number.
measure_dist <- function(mod, data) {
  errs <- data$y - model1(mod, data)
  sqrt(mean(errs^2))
}
measure_dist(c(5, 5), sim1)

## [1] 19.10774

10 Fitting linear models with lm()

# Fit y as a linear function of x on sim1.
sim1_mod <- lm(y ~ x, data = sim1)

# Fitted intercept and slope.
coef(sim1_mod)

## (Intercept)           x 
##    4.220822    2.051533

# Simulated data: y = 1.5 * x + 6 plus noise drawn from a t distribution
# with 2 degrees of freedom. The draw is random, so the coefficients
# recorded below vary from run to run.
sim1a <- tibble(
  x = rep(1:10, each = 3),
  y = x * 1.5 + 6 + rt(length(x), df = 2)
)

sim1a_mod <- lm(y ~ x, data = sim1a)

coef(sim1a_mod)

## (Intercept)           x 
##    5.694273    1.639591

# Grid built from the x column of sim1 with modelr::data_grid().
grid <- sim1 %>%
  data_grid(x)
grid

# Add the predictions of sim1_mod to the grid.
grid <- grid %>%
  add_predictions(sim1_mod)
grid

11 Prime Find Func

# Primality test for a single whole number.
#   x  a single whole number
# Returns x itself when x is prime and 0 otherwise, so callers can test
# `primary(x) > 0`.
primary <- function(x) {
  # Numbers below 2 are not prime. They are rejected up front because a
  # descending 2:(x - 1) sequence would otherwise be used as divisors.
  if (x < 2) {
    return(0)
  }
  # 2 and 3 are prime and have no candidate divisor to try.
  if (x < 4) {
    return(x)
  }
  # A composite number always has a divisor no larger than its square root.
  divisors <- 2:floor(sqrt(x))
  if (all(x %% divisors != 0)) x else 0
}

# Candidates to test.
# NOTE(review): the variable name `array` is also the name of base R's
# array() function; the name is kept so existing references keep working.
array <- 1:1000

# Keep the candidates that primary() reports as prime. Filter() builds the
# result in one step, so no NA seed element has to be removed afterwards.
primeArray <- Filter(function(x) primary(x) > 0, array)

count <- length(primeArray)
count

## [1] 168

primeArray

##   [1]   2   3   5   7  11  13  17  19  23  29  31  37  41  43  47  53  59
##  [18]  61  67  71  73  79  83  89  97 101 103 107 109 113 127 131 137 139
##  [35] 149 151 157 163 167 173 179 181 191 193 197 199 211 223 227 229 233
##  [52] 239 241 251 257 263 269 271 277 281 283 293 307 311 313 317 331 337
##  [69] 347 349 353 359 367 373 379 383 389 397 401 409 419 421 431 433 439
##  [86] 443 449 457 461 463 467 479 487 491 499 503 509 521 523 541 547 557
## [103] 563 569 571 577 587 593 599 601 607 613 617 619 631 641 643 647 653
## [120] 659 661 673 677 683 691 701 709 719 727 733 739 743 751 757 761 769
## [137] 773 787 797 809 811 821 823 827 829 839 853 857 859 863 877 881 883
## [154] 887 907 911 919 929 937 941 947 953 967 971 977 983 991 997

12 Order()

v <- c(12, 11, 5, 22, 5)

b <- c("abc", "bcv", "aab")

# order() returns positions, not values: the indices that would put v in
# ascending order (the smallest value, 5, sits at positions 3 and 5).
order(v)

## [1] 3 5 2 1 4

# Indexing a vector by its own order() sorts it.
v <- v[order(v)]

v

## [1]  5  5 11 12 22

# The same idiom applied to a character vector.
b <- b[order(b)]

b

## [1] "aab" "abc" "bcv"

a <- c(1,4,2,3)

order(a)

## [1] 1 3 4 2

13 Lists()

# 2 x 3 integer matrix whose rows are 1:3 and 4:6.
a <- rbind(1:3, 4:6)

# Named list holding the matrix and an integer vector.
# NOTE(review): the variable name `c` is also the name of base R's c()
# function.
c <- list(a = a, b = 5:7)

c

## $a
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## 
## $b
## [1] 5 6 7

# Second element of component b, reached with $ and [ ...
c$b[2]

## [1] 6

# ... and with [[ ]] at both levels.
c[["b"]][[2]]

## [1] 6

# Lists nest: both components of d hold the list c.
d <- list(e = c, f = c)
d

## $e
## $e$a
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## 
## $e$b
## [1] 5 6 7
## 
## 
## $f
## $f$a
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## 
## $f$b
## [1] 5 6 7

My R Notes

1 Uses of ggplot2

2 Data Transformations

3 Reading Files

4 Parsing a Vector

5 Exporting/Importing Files

6 Strings

7 Determine Matches

8 Models

9 Root-mean-squared deviation

10 Fitting linear models with lm()

11 Prime Find Func

12 Order()

13 Lists()