Personal tools
You are here: Home Software Packages R Project for Statistical Computing Data Management & Programming using R Merging and Joining Data with R

Merging and Joining Data with R

A few exercises

mergingandjoining.r — Plain Text, 2Kb

File contents

# Load some sample data
df1 = data.frame(CaseId=c(1:3, 1:3),Time=c(rep(1,3), rep(2,3)),Result=c(rep("Pass",3),rep("Fail",3)))
df2 = data.frame(CaseId=c(1,2,2,7),Time=c(1,1,2,1),Version=c(rep("One",2),rep("Two",2)))

# Inner Join, the default.  Merges on all matching field names (natural join)
innerjoin<-merge(df1, df2)
# Inner Join. Explicit BY variable
innerjoinbyvar<-merge(df1, df2, by="CaseId")
# Better to be explicit
innerjoinbyvars<-merge(df1,df2, by=c("CaseId","Time"))
# Outer
outerjoin<-merge(df1, df2, all=TRUE)
# Outer Explicit by variable
outerjoinbyvar<-merge(df1, df2, by="CaseId", all=TRUE)
# Left Outer
leftjoin<-merge(df1, df2, all.x=TRUE)
# Left Outer Explicit BY variable
leftjoinbyvar<-merge(df1, df2, by="CaseId", all.x=TRUE)
# Right Outer
rightjoin<-merge(df1, df2,  all.y=TRUE)
# Right Outer Explicit by variable
rightjoinbyvar<-merge(df1, df2, by="CaseId", all.y=TRUE)

# Load more data
emp = data.frame(LastName=c("Rafferty", "Jones", "Steinberg","Robinson","Smith", "John"),DeptID=c(31,33,33,34,34,NA))
dept = data.frame(DeptName=c("Sales", "Engineering", "Clerical","Marketing"),DeptID=c(31,33,34,35) )
# More joins
naturaljoin<-merge(emp, dept)
equijoin<-merge(emp, dept, by="DeptID")
lefto<-merge(emp, dept, by="DeptID", all.x=TRUE)
righto<-merge(emp, dept, by="DeptID", all.y=TRUE)

# Cartesian, cross-join?
MinM<-merge(merge(df1,df2), dept)
MMged<-merge(innerjoin, dept)

# Load more data (from help pages)
authors <- data.frame(
    surname = c("Tukey", "Venables", "Tierney", "Ripley", "McNeil"),
    nationality = c("US", "Australia", "US", "UK", "Australia"),
    deceased = c("yes", rep("no", 4)))
books <- data.frame(
    name = c("Tukey", "Venables", "Tierney",
             "Ripley", "Ripley", "McNeil", "R Core"),
    title = c("Exploratory Data Analysis",
              "Modern Applied Statistics ...",
              "LISP-STAT",
              "Spatial Statistics", "Stochastic Simulation",
              "Interactive Data Analysis",
              "An Introduction to R"),
    other.author = c(NA, "Ripley", NA, NA, NA, NA,
                     "Venables & Smith"))

(m1 <- merge(authors, books, by.x = "surname", by.y = "name"))
(m2 <- merge(books, authors, by.x = "name", by.y = "surname"))
stopifnot(as.character(m1[,1]) == as.character(m2[,1]),
          all.equal(m1[, -1], m2[, -1][ names(m1)[-1] ]),
          dim(merge(m1, m2, by = integer(0))) == c(36, 10))

## "R core" is missing from authors and appears only here :
m3 <- merge(authors, books, by.x = "surname", by.y = "name", all = TRUE)

## Aggregate data example
classdata<-data.frame(workshop=c(rep(c("R","SAS"),4)), gender=c(rep("f",3),"NA", rep("m",4)), q1=c(1,2,2,3,4,5,5,6),q2=c(1,1,2,1,5,4,3,5))
# Add summary statistics to data frame
classdata$q1mean<-mean(classdata$q1)

# Merge summary statistics back into a data frame by GROUP
# Cool plyr function, not sure whether embedding in a merge is the best way?
library(plyr)
classdatawithmeans<-merge(classdata,ddply(classdata,~gender, summarise, q1meanbysex=mean(q1)), by="gender")
classdatawithmeans
Document Actions

Copyright ©2014, The Pennsylvania State University | Privacy and Legal Statements
Contact the Help Site Administrator | Last modified Jan 28, 2013 | Weblion Partner