[R-bloggers] Interacting with AWS from R (and 4 more aRticles)


Interacting with AWS from R

Posted: 30 Jun 2018 06:30 AM PDT

(This article was first published on Digital Age Economist on Digital Age Economist, and kindly contributed to R-bloggers)

Getting set up

If there is one realisation in life, it is that you will never have enough CPU or RAM available for your analytics. Luckily for us, cloud computing is becoming cheaper each year. One of the more established providers of cloud services is AWS. If you don't know yet, they provide a free, yes free, option. Their t2.micro instance is a 1 CPU, 1GB RAM machine, which doesn't sound like much, but I am running an RStudio and Docker instance on one of these for a small project.

The management console has the following interface:

So, how cool would it be if you could start up one of these instances from R? Well, the cloudyr project makes R a lot better at interacting with cloud-based computing infrastructure. With this in mind, I have been playing with the aws.ec2 package, a simple client for the Amazon Web Services ('AWS') Elastic Compute Cloud ('EC2') API. There is some irritating setup that has to be done, so if you want to use this package, you need to follow the instructions on the GitHub page to create the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_DEFAULT_REGION variables in your environment. But once you have figured out this step, the fun starts.

I always enjoy getting the development version of a package, so I am going to install the package straight from GitHub:

devtools::install_github("cloudyr/aws.ec2")  

Next we are going to use an Amazon Machine Image (AMI), which is a pre-built image that already contains all the necessary installations such as R and RStudio. You can also build your own AMI, and I suggest you do so if you are comfortable with the Linux CLI.

Release the beast

library(aws.ec2)

# Describe the AMI (from: http://www.louisaslett.com/RStudio_AMI/)
aws.signature::locate_credentials()
image <- "ami-3b0c205e"
describe_images(image)

In the code snippet above you will notice I call aws.signature::locate_credentials(). I use this function to confirm my credentials. You will need to populate your own credentials after creating a user profile on the IAM management console and generating an ACCESS_KEY for use of the API. My preferred method of supplying the credentials is to add the information to the environment using usethis::edit_r_environ().

Here is my (fake) .Renviron:

AWS_ACCESS_KEY_ID=F8D6E9131F0E0CE508126
AWS_SECRET_ACCESS_KEY=AAK53148eb87db04754+f1f2c8b8cae222a2
AWS_DEFAULT_REGION=us-east-2
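
After restarting R, a quick sanity check (my addition, not part of the original post) confirms that the session can see the new variables:

# Returns TRUE for each variable that is set and non-empty
Sys.getenv(c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION")) != ""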

Now we are almost ready to test out the package and its functions, but first, I recommend you source a handy function I wrote that helps tidy the outputs of selected aws.ec2 functions.

source("https://bit.ly/2KnkdzV")  

I found the list object returned from functions such as describe_images(), describe_instances() and instance_status() very verbose and difficult to work with. The tidy_describe() function cleans up the outputs and returns only the most important information. It also implements a pretty_print option, which cats the output to the screen as a table for a quick overview of the information contained in the object. A rough sketch of the idea behind it appears below.
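
The sourced file is the canonical version; the following is only a minimal sketch of the idea (the function name and the flattening approach are my own guesses, not the actual implementation):

# Hypothetical sketch, not the sourced tidy_describe() implementation
tidy_describe_sketch <- function(x, pretty_print = TRUE) {
  # flatten the nested list returned by aws.ec2 into name/value pairs
  flat <- unlist(x, recursive = TRUE, use.names = TRUE)
  out  <- tibble::tibble(field = names(flat), value = unname(flat))
  if (pretty_print) {
    cat("--------------------------------------\n")
    cat("               Summary\n")
    cat("--------------------------------------\n")
    cat(paste(out$field, ":", out$value), sep = "\n")
    invisible(out)
  } else {
    out
  }
}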

Let's use this function to see the output from describe_images() as a pretty_print. Print the aws_describe object without this handy function at your own peril.

image <- "ami-3b0c205e"
aws_describe <- describe_images(image)
aws_describe %>% tidy_describe(.)
--------------------------------------
               Summary
--------------------------------------
imageId : ami-3b0c205e
imageOwnerId : 732690581533
creationDate : 2017-10-17T09:28:45.000Z
name : RStudio-1.1.383_R-3.4.2_Julia-0.6.0_CUDA-8_cuDNN-6_ubuntu-16.04-LTS-64bit
description : Ready to run RStudio + Julia/Python server for statistical computation (www.louisaslett.com). Connect to instance public DNS in web brower (standard port 80), username rstudio and password rstudio

To return as tibble: pretty_print = FALSE

Once we have confirmed that we are happy with the image, we need to save the subnet information as well as the security group information.

s <- describe_subnets()
g <- describe_sgroups()

Now that you have specified those two things, you have all the pieces needed to spin up the machine of your choice. To see what machines are available, visit the instance type webpage. Warning: choosing big machines with lots of CPU and a ton of RAM can be addictive. Winners know when to stop.

In this example I spin up a t2.micro instance, which is part of the free tier that Amazon provides.

# Launch the instance using appropriate settings
i <- run_instances(image = image,
                   type = "t2.micro", # <- change this to something like x1e.32xlarge ($26.688 p/h) if you're feeling adventurous
                   subnet = s[[1]],
                   sgroup = g[[1]])

Once I have executed the code above, I can check on the instance using instance_status() to see if the machine is ready, or describe_instances() to get the meta information on the machine, such as its IP address. Again, I use the custom tidy_describe():

aws_instance <- describe_instances(i)
aws_instance %>% tidy_describe()
--------------------------------------
               Summary
--------------------------------------
ownerId : 748485365675
instanceId : i-007fd9116488691fe
imageId : ami-3b0c205e
instanceType : t2.micro
launchTime : 2018-06-30T13:15:50.000Z
availabilityZone : us-east-2b
privateIpAddress : 172.31.16.198
ipAddress : 18.222.174.186
coreCount : 1
threadsPerCore : 1

To return as tibble: pretty_print = FALSE
aws_status <- instance_status(i)
aws_status %>% tidy_describe()
--------------------------------------
               Summary
--------------------------------------
instanceId : i-007fd9116488691fe
availabilityZone : us-east-2b
code : 16
name : running

To return as tibble: pretty_print = FALSE

The final bit of code (which is VERY important when running large instances) stops the instance and confirms that it has been terminated:

# Stop and terminate the instances
stop_instances(i[[1]])
terminate_instances(i[[1]])
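
As a quick follow-up check (my addition, not in the original post), querying the status again should show the state moving from running to shutting-down and finally terminated:

# Re-check the status after terminating; state should no longer be "running"
aws_status <- instance_status(i)
aws_status %>% tidy_describe()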

Final comments

Working with AWS instances for a while now has really been a game changer in the way I conduct and approach any analytical project. Having the capability to switch on large machines on demand and quickly run any of my analytical scripts has opened up new opportunities for what I can do as a consultant with a very limited hardware budget. Besides, where would I keep a 96-core, 500GB-RAM machine once I had scraped together enough cash to actually build one?

To leave a comment for the author, please follow the link and comment on their blog: Digital Age Economist on Digital Age Economist.


RcppArmadillo 0.8.600.0.0

Posted: 29 Jun 2018 07:11 PM PDT

(This article was first published on Thinking inside the box , and kindly contributed to R-bloggers)


A new RcppArmadillo release 0.8.600.0.0, based on the new Armadillo release 8.600.0 from this week, just arrived on CRAN.

It follows our (and Conrad's) bi-monthly release schedule. We have made interim and release candidate versions available via the GitHub repo (and as usual thoroughly tested them) but this is the real release cycle. A matching Debian release will be prepared in due course.

Armadillo is a powerful and expressive C++ template library for linear algebra, aiming towards a good balance between speed and ease of use, with a syntax deliberately close to Matlab. RcppArmadillo integrates this library with the R environment and language, and is widely used by (currently) 479 other packages on CRAN.

A high-level summary of changes follows (which omits the two rc releases leading up to 8.600.0). Conrad did his usual impressive load of upstream changes, but we are also grateful for the RcppArmadillo fixes added by Keith O'Hara and Santiago Olivella.

Changes in RcppArmadillo version 0.8.600.0.0 (2018-06-28)

  • Upgraded to Armadillo release 8.600.0 (Sabretooth Rugrat)

    • added hess() for Hessenberg decomposition (see the sketch after this changelog)

    • added .row(), .rows(), .col(), .cols() to subcube views

    • expanded .shed_rows() and .shed_cols() to handle cubes

    • expanded .insert_rows() and .insert_cols() to handle cubes

    • expanded subcube views to allow non-contiguous access to slices

    • improved tuning of sparse matrix element access operators

    • faster handling of tridiagonal matrices by solve()

    • faster multiplication of matrices with differing element types when using OpenMP

Changes in RcppArmadillo version 0.8.500.1.1 (2018-05-17) [GH only]

  • Upgraded to Armadillo release 8.500.1 (Caffeine Raider)

    • bug fix for banded matrices

  • Added slam to Suggests: as it is used in two unit test functions [CRAN requests]

  • The RcppArmadillo.package.skeleton() function now works with example_code=FALSE when pkgKitten is present (Santiago Olivella in #231 fixing #229)

  • The LAPACK tests now cover band matrix solvers (Keith O'Hara in #230).
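
As a quick illustration of the new hess() (my example, not from the release notes; hess_decomp is a made-up name), the decomposition can be reached from R with an inline function:

# Sketch: calling Armadillo's hess() from R via inline C++
library(Rcpp)
cppFunction(depends = "RcppArmadillo", code = '
  Rcpp::List hess_decomp(const arma::mat& X) {
    arma::mat U, H;
    arma::hess(U, H, X);   // X = U * H * U.t(), with H upper Hessenberg
    return Rcpp::List::create(Rcpp::Named("U") = U,
                              Rcpp::Named("H") = H);
  }
')

m <- matrix(rnorm(16), 4, 4)
res <- hess_decomp(m)
max(abs(res$U %*% res$H %*% t(res$U) - m))  # ~0: reconstruction holds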

Courtesy of CRANberries, there is a diffstat report relative to the previous release. More detailed information is on the RcppArmadillo page. Questions, comments etc should go to the rcpp-devel mailing list off the R-Forge page.

This post by Dirk Eddelbuettel originated on his Thinking inside the box blog. Please report excessive re-aggregation in third-party for-profit settings.

To leave a comment for the author, please follow the link and comment on their blog: Thinking inside the box .


Punctuation in literature

Posted: 29 Jun 2018 05:00 PM PDT

(This article was first published on Rstats on Julia Silge, and kindly contributed to R-bloggers)

This morning I was scrolling through Twitter and noticed Alberto Cairo share this lovely data visualization piece by Adam J. Calhoun about the varying prevalence of punctuation in literature. I thought, "I want to do that!" It also offers me the opportunity to chat about a few of the new options available for tokenizing in tidytext via updates to the tokenizers package.

Adam's original piece explores how punctuation is used in nine novels, including my favorite Pride and Prejudice.
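
For a taste of what those new tokenizer options enable (a minimal sketch of my own, not code from either piece), unnest_tokens() can now keep punctuation as tokens via strip_punct = FALSE:

library(dplyr)
library(tidytext)

d <- tibble::tibble(text = "I want to do that! It offers an opportunity, doesn't it?")

# strip_punct = FALSE is passed through to tokenizers::tokenize_words()
d %>%
  unnest_tokens(token, text, token = "words", strip_punct = FALSE) %>%
  filter(token %in% c(",", ".", "!", "?", ";", ":"))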

To leave a comment for the author, please follow the link and comment on their blog: Rstats on Julia Silge.


Global Migration, animated with R

Posted: 29 Jun 2018 02:30 PM PDT

(This article was first published on Revolutions, and kindly contributed to R-bloggers)

The animation below, by Shanghai University professor Guy Abel, shows migration within and between regions of the world from 1960 to 2015. The data and the methodology behind the chart are described in this paper. The curved bars around the outside represent the peak migrant flows for each region; globally, migration peaked during the 2005-2010 period and then declined in 2010-2015, the latest period for which data are available.

[Animation: global migration chord diagram, 1960-2015]

This animated chord chart was created entirely using the R language. The chord plot showing the flows between regions was created using the circlize package; the tweenr package created the smooth transitions between time periods, and the magick package created the animated GIF you see above. You can find a tutorial on making this animation, including the complete R code, at the link below.

Guy Abel: Animated Directional Chord Diagrams (via Cal Carrie)
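
As a rough sketch of that pipeline (my own toy example with fake data, not Guy Abel's code), circlize draws each frame and magick assembles the GIF:

library(circlize)
library(magick)

set.seed(42)
regions <- c("Africa", "Asia", "Europe", "Americas")

# Draw one chord diagram per period into an in-memory graphics device
frames <- image_graph(width = 400, height = 400, res = 96)
for (year in c(1990, 2000, 2010)) {
  m <- matrix(sample(1:20, 16, replace = TRUE), 4, 4,
              dimnames = list(regions, regions))  # fake flow matrix
  chordDiagram(m)
  title(main = year)
}
dev.off()

# Stitch the frames into an animated GIF
image_write(image_animate(frames, fps = 1), "migration.gif")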

To leave a comment for the author, please follow the link and comment on their blog: Revolutions.


Benchmarking a SSD drive in reading and writing files with R

Posted: 29 Jun 2018 12:00 AM PDT

(This article was first published on Marcelo S. Perlin, and kindly contributed to R-bloggers)

I recently bought a new computer for home and it came with two drives,
one HDD and one SSD. The latter is used for the OS and the former for
all of my files. Of all the computers I have had, both at home and at
work, this is definitely the fastest. While some of the merit is due to
the newer CPU and RAM, the SSD drive can make all the difference in file
operations.

My research usually deals with large files from financial markets. Being
efficient in reading those files is key to my productivity. Given that,
I was very curious to understand how much speed I would gain by
reading/writing files on my SSD drive instead of the HDD. For that,
I wrote a simple function that times a particular operation. The
function takes as input the number of rows in the data (1..Inf), the
type of function used to save the file (rds, csv, fst) and the
type of drive (HDD or SSD). See next.

bench.fct <- function(N = 2500000, type.file = 'rds-base', type.hd = 'HDD') {
  # Function for timing read and write operations
  #
  # INPUT: N - Number of rows in the dataframe to be written and read
  #        type.file - format of output file (rds-base, rds-readr, fst, csv-readr, csv-base)
  #        type.hd - where to save (HDD or SSD)
  #
  # OUTPUT: A dataframe with results
  require(tidyverse)
  require(fst)

  my.df <- data_frame(x = runif(N),
                      char.vec = sample(letters, size = N,
                                        replace = TRUE))

  path.file <- switch(type.hd,
                      'SSD' = '~',
                      'HDD' = '/mnt/HDD/')

  my.file <- file.path(path.file,
                       switch(type.file,
                              'rds-base'  = 'temp_rds.rds',
                              'rds-readr' = 'temp_rds.rds',
                              'fst'       = 'temp_fst.fst',
                              'csv-readr' = 'temp_csv.csv',
                              'csv-base'  = 'temp_csv.csv'))

  if (type.file == 'rds-base') {
    time.write <- system.time(saveRDS(my.df, my.file, compress = FALSE))
    time.read  <- system.time(readRDS(my.file))
  } else if (type.file == 'rds-readr') {
    time.write <- system.time(write_rds(x = my.df, path = my.file, compress = 'none'))
    time.read  <- system.time(read_rds(path = my.file))
  } else if (type.file == 'fst') {
    time.write <- system.time(write.fst(x = my.df, path = my.file))
    time.read  <- system.time(read_fst(my.file))
  } else if (type.file == 'csv-readr') {
    time.write <- system.time(write_csv(x = my.df, path = my.file))
    time.read  <- system.time(read_csv(file = my.file,
                                       col_types = cols(x = col_double(),
                                                        char.vec = col_character())))
  } else if (type.file == 'csv-base') {
    time.write <- system.time(write.csv(x = my.df, file = my.file))
    time.read  <- system.time(read.csv(file = my.file))
  }

  # clean up
  file.remove(my.file)

  # save output (elapsed times, in seconds)
  df.out <- data_frame(type.file = type.file,
                       type.hd = type.hd,
                       N = N,
                       type.time = c('write', 'read'),
                       times = c(time.write[3], time.read[3]))

  return(df.out)
}

Now that we have the function, it's time to use it for all combinations
of number of rows, file format and drive type:

library(purrr)

df.grid <- expand.grid(N = seq(1, 500000, by = 50000),
                       type.file = c('rds-readr', 'rds-base', 'fst', 'csv-readr', 'csv-base'),
                       type.hd = c('HDD', 'SSD'), stringsAsFactors = FALSE)

l.out <- pmap(list(N = df.grid$N,
                   type.file = df.grid$type.file,
                   type.hd = df.grid$type.hd), .f = bench.fct)

df.res <- do.call(what = bind_rows, args = l.out)

Let's check the result in a nice plot:

library(ggplot2)

p <- ggplot(df.res, aes(x = N, y = times, linetype = type.hd)) +
  geom_line() + facet_grid(type.file ~ type.time)

print(p)

As you can see, the csv-base format is messing with the y axis. Let's
remove it for better visualization:

library(ggplot2)

p <- ggplot(filter(df.res, !(type.file %in% c('csv-base'))),
            aes(x = N, y = times, linetype = type.hd)) +
  geom_line() + facet_grid(type.file ~ type.time)

print(p)

When it comes to the file format, we learn:

  • By far, the fst format is the best. It takes less time to read
    and write than the others. However, it is probably unfair to compare
    it to csv and rds, as fst uses many of the 16 cores of my
    computer (see the note after this list).

  • readr is a great package for writing and reading csv files.
    You can see a large time difference compared with the base
    functions. This is likely due to readr's use of low-level functions
    to write and read text files.

  • When using the rds format, the base functions do not differ much
    from the readr functions.
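
A side note of mine (not from the original post): fst's parallelism is tunable, so a fairer single-threaded comparison could pin it to one core before benchmarking:

library(fst)

threads_fst(1)  # restrict fst to a single thread
threads_fst()   # query the current number of threads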

As for the effect of using an SSD, it is clear that it DOES NOT affect
the time of reading and writing. The differences between using the HDD
and the SSD look like noise. Seeking to provide a more robust analysis,
let's formally test this hypothesis using a simple t-test for the means:

tab <- df.res %>%
  group_by(type.file, type.time) %>%
  summarise(mean.HDD = mean(times[type.hd == 'HDD']),
            mean.SSD = mean(times[type.hd == 'SSD']),
            p.value = t.test(times[type.hd == 'SSD'],
                             times[type.hd == 'HDD'])$p.value)

print(tab)

## # A tibble: 10 x 5
## # Groups:   type.file [?]
##    type.file type.time mean.HDD mean.SSD p.value
##    <chr>     <chr>        <dbl>    <dbl>   <dbl>
##  1 csv-base  read       0.554    0.463    0.605
##  2 csv-base  write      0.405    0.405    0.997
##  3 csv-readr read       0.142    0.126    0.687
##  4 csv-readr write      0.0711   0.0706   0.982
##  5 fst       read       0.015    0.0084   0.0584
##  6 fst       write      0.00900  0.00910  0.964
##  7 rds-base  read       0.0321   0.0303   0.848
##  8 rds-base  write      0.0253   0.025    0.969
##  9 rds-readr read       0.0323   0.0304   0.845
## 10 rds-readr write      0.0251   0.0247   0.957

As we can see, the null hypothesis of equal means easily fails to be
rejected for almost all file types and operations at the 10% level. The
exception is the fst format in the read operation. In other words,
statistically, it makes no difference in time whether you use an SSD or
an HDD to read or write files in these formats.

I am very surprised by this result. Regardless of the file format, I
expected a large difference, as SSD drives are much faster within an
OS. Am I missing something? Is this due to the OS being on the SSD?
What do you guys think?

To leave a comment for the author, please follow the link and comment on their blog: Marcelo S. Perlin.

