# Part 1: load the SPSS file into the data frame H96B, explore it, recode
# E781 to 0/1/NA, and define the helper "to.1.or.0" that applies the same
# recoding to any variable that uses the 5/3/8/9 coding scheme.

# Use "objects()" or "ls()" to see the objects in the work space
objects()

# To see the files in the folder/directory use "dir()"
dir()

# To see the commands that have been entered, use "history"
# (only available in an interactive session)
history()

# The SPSS data can be converted to a R object by using one of the functions
# in the "foreign" package
# Use the "library(package-name)" to load or make available any package
# Entering "library()" will display all of the packages that have been
# installed
library()
library(foreign)

# If we want to verify the package is available, we can enter the "search()"
# function
search()

# The numbers in the brackets are the order in which the search for data
# and/or functions is done.  ".GlobalEnv" is the user workspace
# The contents of the foreign package can be displayed by giving the
# "objects" function the package's number in the search order
# (objects("package:foreign") does the same by name)
objects(2)

# We want to use the "read.spss" function
help(read.spss)

# Read the data into an R object
H96B <- read.spss("H96B.sav", to.data.frame = TRUE)

# Use the "dim" function to display the number of rows and columns
dim(H96B)

# To see the type of object, use "class"
class(H96B)

# To see the columns names, use "names"
names(H96B)

# Operations can be done by row or column or selected parts of either
# The data can be selectively viewed by selecting rows and columns
# The square brackets "[ ]" are used with the rows and columns separated by
# a comma
# To see the entire column for E950 enter H96B[,10]
# E950 is column 10, nothing preceding the comma indicates all rows
H96B[, 10]

# Not really useful.  There are several methods for viewing parts of the data
# The ":" operator can be used to specify a range.  The "c" function can
# build a vector of values
# To display rows 1 - 10
H96B[1:10, ]

# To display rows 2,4,6,7, and 9
H96B[c(2, 4, 6, 7, 9), ]

# The negative sign will remove rows and/or columns from the selection
H96B[1:10, -c(1, 2)]

# An individual column may also be referenced by using a "$" followed by the
# column name
H96B$E769

# To display a limited number of rows, use the brackets and specify row
# numbers
H96B$E769[1:20]

# Display the class of the object
class(H96B$E769[1:20])

# R has taken the column of data and "converted" it into a numeric vector
# For a quick overview of the data, use the "summary" function
summary(H96B)

# HHID and PN were character values in the original data.  R is treating
# them as factors, thus the frequencies after the individual values.  The
# (Other):10946 is the combined count of all the values beyond the 6
# displayed.  The other values in the data are numeric.  "NA" is the R
# version of a missing value.  They may require different treatment than
# missings in SAS, SPSS, or stata
mean(H96B$E943)

# The mean returns a NA because there are NA values in the data.  R will not
# remove the NA values by default in many functions.  An option needs to be
# set to tell R to remove the NA values
mean(H96B$E943, na.rm = TRUE)

# Frequencies can be generated using the "table" function
table(H96B$E828)

# Modification of variables
# For E781, change "5" (NO) to "0".  Change "3" to "1" and "8" and "9" to "NA"
# Table of values for this variable
table(H96B$E781)

# First, generate a vector of logical (TRUE/FALSE) values for the variable
# equal to "5"
is5 <- H96B$E781 == 5

# Display values 50-70
is5[50:70]

# Class of is5?
class(is5)

# As a double check, we can sum the number of "TRUE" elements in our vector
# If we use the "sum" function, R will coerce the TRUE/FALSE values to 1/0
# (this first call is left without na.rm on purpose to show the problem)
sum(is5)

# Ooops!  We need to ignore the NA values.  Any NA value was not assigned a
# TRUE or FALSE, it remains NA
sum(is5, na.rm = TRUE)

# When we use the sum function, R will consider the TRUE/FALSE values to be
# 1/0
# The "==" operator which checks for equality will not work for NA.  To check
# for NA, use the "is.na" function
X <- NA
X
X == NA
is.na(X)

# Objects can be removed/deleted with the "rm" or "remove" functions
rm(X)

# Back to the conversions.....
# We have a vector of values if the variable is equal to 5.  We can use it to
# specify which elements to change.  "which" returns the positions that are
# TRUE and leaves out the FALSE and NA positions
H96B$E781[which(is5)] <- 0

# Check with a table
table(H96B$E781)

# Success!  Now we can do the same with converting 3 to 1 and 8,9 to NA
is3 <- H96B$E781 == 3
H96B$E781[which(is3)] <- 1
table(H96B$E781)

is8.9 <- H96B$E781 == 8 | H96B$E781 == 9
sum(is8.9, na.rm = TRUE)
H96B$E781[which(is8.9)] <- NA

# There should only be 0,1 and NA values now
table(H96B$E781)
sum(is.na(H96B$E781))

# Since there are other variables with the same coding scheme that need the
# same set of operations, we can write our own function to make the changes.
# Interactively, "fix(to.1.or.0)" opens a function in an editor so it can be
# created or modified.  It is defined directly here so that the script also
# runs when it is sourced.
# -----------------------------------------------------------
# Recode a variable: 5 becomes 0, 3 becomes 1, 8 and 9 become NA.
#   variable: a vector of codes
#   returns:  the recoded vector, same length as the input
to.1.or.0 <- function(variable) {
  # Build every mask from the original codes before anything is changed
  is.a.5 <- variable == 5
  is.a.3 <- variable == 3
  is.a.8.9 <- variable == 8 | variable == 9

  variable[which(is.a.5)] <- 0
  variable[which(is.a.3)] <- 1
  variable[which(is.a.8.9)] <- NA

  # return the variable, everything else is local to the function
  variable
}
# -----------------------------------------------------------

# When writing functions, the "debug" function helps track what is going on
debug(to.1.or.0)

# Before changing the data, see if things look like they are working
table(to.1.or.0(H96B$E788))
table(H96B$E788)

# Turn off the debug!!!!
# Part 2: apply the 0/1 recoding to E788 and E828, combine HHID and PN into
# the single identifier HHIDPN, and turn the "don't know / refused" codes of
# E954, E753, E943 and E950 into NA.
undebug(to.1.or.0)

# The table generated from the function is what we expect, so change the data
H96B$E788 <- to.1.or.0(H96B$E788)
table(H96B$E788)

# Do the same with E828
table(H96B$E828)
H96B$E828 <- to.1.or.0(H96B$E828)
table(H96B$E828)

# Combine the HHID (Household) and PN (Person) into 1 variable
# Since they are factors and have the leading zeros, we should be able
# to use the "paste0" function ("paste" with no separator)
# The "nchar" function can be used to verify that all the variables have
# 6 or 3 characters
table(nchar(as.character(H96B$HHID)))
table(nchar(as.character(H96B$PN)))

# Create a new variable HHIDPN
HHIDPN <- paste0(H96B$HHID, H96B$PN)
HHIDPN[1103:1117]

# Use "cbind" function to add this to the data
H96B <- cbind(HHIDPN, H96B)
names(H96B)

# Lets remove the HHID and PN variables (now columns 2 and 3).  Selecting by
# name keeps this correct even if the column order changes
H96B <- H96B[, !(names(H96B) %in% c("HHID", "PN"))]
names(H96B)

# Change E954 values of 997, 998, and 999 to NA
# "which" picks the TRUE positions and skips positions that are already NA
is900 <- H96B$E954 > 996
H96B$E954[which(is900)] <- NA
sum(is.na(H96B$E954))

# Change age of 0 to NA
# na.rm is needed for the count, otherwise any NA in E753 makes the sum NA
is0 <- H96B$E753 == 0
sum(is0, na.rm = TRUE)
H96B$E753[which(is0)] <- NA

# Change E943 997,998,999 values to NA
is900 <- H96B$E943 > 996
H96B$E943[which(is900)] <- NA
sum(is.na(H96B$E943))

# Same for E950
is900 <- H96B$E950 > 996
H96B$E950[which(is900)] <- NA
sum(is.na(H96B$E950))

# And do the same for 96,98,99 values in E951 (Could leave 96?)
# Part 3: finish the NA recoding (E951), derive drinks per week, convert the
# two health ratings to labelled factors, build cross tabs and plots, and
# create the renamed age 50-54 subset H96B50.54.
is90 <- H96B$E951 > 95
H96B$E951[which(is90)] <- NA
sum(is.na(H96B$E951))

# Since we have drinks per day (E951) and days drink per week (E950),
# calculate drinks per week
H96B$DPW <- H96B$E950 * H96B$E951
mean(H96B$DPW, na.rm = TRUE)

# Drop height variables - too many NAs
# They are selected by position (columns 12 and 13), so check the names
# printed before and after the drop
names(H96B)
H96B <- H96B[, -c(12, 13)]
names(H96B)

# Change E769 and E772 to factors
# First remove "extra" categories
# Table before any changes
table(H96B$E769)

# Count the number of NA values and 8,9 values
is8.9 <- H96B$E769 > 5
sum(is.na(is8.9))
sum(is8.9, na.rm = TRUE)
H96B$E769[which(is8.9)] <- NA
sum(is.na(H96B$E769))
table(H96B$E769)

# Variable E772
table(H96B$E772)
is8.9 <- H96B$E772 > 5
sum(is8.9, na.rm = TRUE)
H96B$E772[which(is8.9)] <- NA
table(H96B$E772)

# Now the two variables have only the categories and NA values so they
# can be changed to factors using the "as.factor" function
H96B$E772 <- as.factor(H96B$E772)
H96B$E769 <- as.factor(H96B$E769)

# The levels function will return the categories or allow you to assign them
# ?levels
# The labels are matched to the existing levels by position, so display the
# levels first and check their order
levels(H96B$E772)
levels(H96B$E772) <- c("Better", "Same", "Worse")
levels(H96B$E769)
levels(H96B$E769) <- c("Excellent", "VeryGood", "Good", "Fair", "Poor")

# The table function now returns the labels for the factors rather than the
# numbers
table(H96B$E769)
table(H96B$E772)

# Summary on "cleaned" data
summary(H96B)

##############################################################################
# To do cross tabs -
# Create an object with the table
# The "margin.table" function can be used to get row and column totals
# The totals for the rows/columns may not equal those of the variables
# because of "NA" values
table(H96B$E769, H96B$E772)

# The output can be written to an object
table1 <- table(H96B$E769, H96B$E772)
table1
class(table1)
names(table1)
dim(table1)

# To calculate row and column sums
margin.table(table1, 1)
margin.table(table1, 2)

# Use "prop.table" to calculate percents of columns or rows
prop.table(table1, 1)
prop.table(table1, 2)

# Or to find the percent in each cell divide the table by its sum
table1 / sum(table1)

# Multiply by 100
100 * table1 / sum(table1)

# To create a bar chart
barplot(margin.table(table1, 1))
barplot(margin.table(table1, 1), col = "blue")
barplot(margin.table(table1, 2), col = "blue")

# Transpose the table using the "t" function
table1
t(table1)

# generate plots for the dataset - don't use this if you have lots of
# variables!
names(H96B)
pairs(H96B[, c(2, 3, 5, 8, 11, 12)])

# Make a subset of those respondents age 50-54
table(H96B$E753)

# ?subset
subset(H96B, E753 > 49 & E753 < 55)
H96B50.54 <- subset(H96B, E753 > 49 & E753 < 55)
dim(H96B50.54)

# Change variable names on subset
ds.names <- names(H96B50.54)
ds.names

# Interactively, "fix(ds.names)" opens the vector in an editor where the
# names can be replaced and saved.  Assigning the new names directly does the
# same thing and also works when the script is sourced
ds.names <- c(
  "HHIDPN", "Age", "Health", "HealthChange", "HBP", "Diabetes", "Heart",
  "Cigs", "Drink.Week", "Drink.Day", "Weight", "DPW"
)
ds.names
names(H96B50.54) <- ds.names
names(H96B50.54)
table(H96B50.54$Age)

# Save the commands entered in this session (interactive sessions only)
savehistory(file = "dataset1.creation")