# Joins the MSA-level happiness table to the 2013 Census CBSA population
# estimates, plots adjusted happiness against population, and lists the
# metro areas above one million residents on either side of zero.

# library() stops with an error if a package is missing; require() would
# only return FALSE and let the script fail later with a confusing message.
library(stringr)
library(ggplot2)

# Load data ----
happiness <- read.csv(
  "GlaeserGottliebZivHappinessMSA_Web.csv",
  stringsAsFactors = FALSE
)
census <- read.csv("CBSA-EST2013-alldata.csv", stringsAsFactors = FALSE)

# Clean data ----
# Strip the area-type suffix from the MSA name and trim the whitespace left
# behind, so the result can be compared with the census NAME column.
happiness$normname <- gsub("PMSA|MSA|NECMA", "", happiness$msaname)
happiness$normname <- str_trim(happiness$normname, side = "both")

# Merge data ----
# Sanity check: print the census row indices whose NAME matches a normalized
# happiness name, to confirm the join key lines up before merging.
which(census$NAME %in% happiness$normname)

# Left join: every happiness row is kept; rows with no census match get NA
# in the census columns.
happiness_census <- merge(
  happiness,
  census,
  by.x = "normname",
  by.y = "NAME",
  all.x = TRUE
)

# Plot data ----
# ggplot() replaces qplot(), which is deprecated as of ggplot2 3.4.0.
ggplot(
  happiness_census,
  aes(x = Adjusted.for.demographics.and.income, y = POPESTIMATE2013)
) +
  geom_point()

# List cities over a million in population by sign of adjusted score ----
# which() drops the NA comparisons produced by unmatched rows, so only rows
# with a known population and score are returned.
adjusted_score <- happiness_census$Adjusted.for.demographics.and.income
over_one_million <- happiness_census$POPESTIMATE2013 > 1000000

# Negative adjusted score (presumably the less happy cities; confirm against
# the data set's documentation).
happiness_census[which(over_one_million & adjusted_score < 0), "normname"]

# Positive adjusted score.
happiness_census[which(over_one_million & adjusted_score > 0), "normname"]