##### Helicopter data: exploring, manipulating, and starting graphs ----
##### D Jenkins, 9 Sep 2021

# These lines are handy to put at the start of an interactive teaching session.
# Remove ALL objects to help make space - most helpful if you have been doing
# other R stuff before this.
rm(list = ls())
# Garbage collection - cleans those objects out of memory.
gc()
# Set the working directory to where the data will be - all subsequent work
# will also be saved there by default.
setwd("~/Desktop")

# Install needed packages. This only needs to be done once - they will exist
# on your computer once this is done, so only the missing ones are installed.
# Installing "tidyverse" actually installs several packages, including dplyr,
# readr, and ggplot2.
needed_packages <- c("readxl", "tidyverse", "ggthemes", "viridis")
already_installed <- vapply(
  needed_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- needed_packages[!already_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Now invoke those packages. You will need to run "library" commands like
# these to get needed packages every time you run R.
library(readxl)    # to read Excel files
library(tidyverse) # to move data around with dplyr, start using pipelines
                   # ( %>% ), and graph with ggplot2
library(ggthemes)  # some nice preloads of graph styles
library(viridis)   # a popular color scheme

### Let's practice loading the data in a couple of ways ----
# Go to the class web page and download the "helicopter data.csv" AND
# "helicopter data.xls" files.
# Below I assume those files are now on your Desktop.

# First import the .csv file using readr:
copter.csv.data <- read_csv("~/Desktop/helicopter data.csv")
# That works great, but it automatically decides the kinds of data in the
# columns. You may find problems in data some day due to importing as the
# wrong types.
# To customize details, go instead to Environment... Import Dataset...
# From Text (readr) and then Browse in the pop-up window to get the file...
# Click on a column heading to change the data type. For example, copter ID is
# not a continuous number with decimal places, but only an integer for a
# design type.
# Change that from "double" to "integer" and click "Import" at the bottom.
### Now let's import straight from the Excel file, using readxl ----
# The Excel file contains two sheets - one with data and one with metadata
# (explanations). You want the data on the first sheet.
copter.xl.data <- read_xls("~/Desktop/helicopter data.xls", sheet = 1)
# What if you want to see the metadata while in R - can you load the second
# sheet?
# What other options exist in readxl? Go to Packages (see right --->), scroll
# down to readxl, and click on the name. There is lots of info available about
# each package.

##### Let's squint at the data, using either the csv or the xl version ----
# To use only one data file hereafter, use this command (convenient, but it
# can also cause problems if you later use multiple data files in a long
# script). Note that attach() works on a copy of the data as it is right now:
# later changes to copter.csv.data are not seen through the attached copy.
attach(copter.csv.data)

# What is the average copter "hang time" for each group?
groupmeans <- copter.csv.data %>%
  group_by(Group) %>%
  summarise(Group.mean = mean(Time, na.rm = TRUE))
# Notice what that did? We grouped by student Groups, then summarized to get a
# mean time per group, after removing any NAs in the data.
# But what did you get?
groupmeans

# This is a short list, but what if you wanted to sort them in order of
# descending means (so the greatest was at the top)?
arrange(groupmeans, desc(Group.mean))

# How many copters were used per Design? Did we get a balanced experiment?
# Here we convert the numeric ID into a non-numeric factor.
copter.csv.data$ID <- factor(copter.csv.data$ID)
designcounts <- copter.csv.data %>%
  group_by(ID) %>%
  count(WL)
designcounts
# What happens if we replace WL above with Time? Why is this different?

# How many unique copter designs were used?
number.of.designs <- length(unique(copter.csv.data$ID))
number.of.designs

# How about a graph? Let's first make a simple boxplot - Time as a function of
# categorical Groups.
boxplot(copter.xl.data$Time ~ copter.xl.data$Group)
# Thick line in the middle = median, box = quartiles, whiskers = the most
# extreme points within 1.5 x the box height (IQR) of the box,
# dots = outliers beyond the whiskers.
# Want a prettier one?
# A ggplot2 boxplot of Time for each Group, with outlines colored by Group.
ggplot(copter.csv.data, aes(x = Group, y = Time, color = Group)) +
  geom_boxplot()

# Wanna get rid of the gray background and change box colors?
ggplot(copter.csv.data, aes(x = Group, y = Time, color = Group)) +
  geom_boxplot(aes(fill = Group)) +
  theme_classic() +
  scale_fill_viridis_d()
# Why is there so much scatter for any one Group?
# Now try some other boxplots of your own - for example, Time ~ WL. And play
# with different themes in ggthemes.

# Or how about a scatter plot between two numerical variables?
plot(Time ~ Step)
# Notice that no "copter.xl.data$" prefix was needed this time? That is
# because we attached a data file (copter.csv.data), so Time and Step are
# found there.

# One regression line per Group: the color aesthetic is mapped inside
# geom_smooth(), so each Group gets its own fitted line.
ggplot(copter.csv.data, aes(x = WL, y = Time)) +
  geom_point(aes(color = Group)) +
  geom_smooth(aes(color = Group), method = "lm") +
  theme_classic()

# Compare that to a single overall regression line: here the smoother has a
# fixed color and no Group mapping, so one line is fitted to all points.
ggplot(copter.csv.data, aes(x = WL, y = Time)) +
  geom_point(aes(color = Group)) +
  geom_smooth(method = "lm", color = "black") +
  theme_classic()
# Can you see the code that made the difference? See how these two graphs help
# answer different questions?
# Why is there so much scatter for any one wing length?
# Now try some other scatter plots of your own - for example, Time ~ WL and
# different themes in ggthemes.

# So what did you accomplish today? You:
#   - learned about memory management (the computer's - not yours)
#   - loaded data in a couple of different ways and learned about details of
#     that process
#   - attached a data file and used dplyr to group and calculate means
#   - started seeing differences between categorical groups and relationships
#     between continuous variables
#   - started becoming a graphic artist
# Stats to be done later will examine multiple effects (Group, WL, BW, Fold,
# and Step) SIMULTANEOUSLY AND SEPARATE FROM OTHER EFFECTS.