Omni test for statistical significance

Thu 09 May 2013

Filed under odds and ends

Tags R reproducibleResearch surveyResearch

Rplot13

In survey research, our datasets nearly always comprise variables with mixed measurement levels - in particular, nominal, ordinal and continuous, or in R-speak, unordered factors, ordered factors and numeric variables. Sometimes it is useful to be able to do blanket tests of one set of variables (possibly of mixed level) against another without having to worry about which test to use.

For this we have developed an omni function which can do binary tests of significance between pairs of variables, either of which can be any of the three aforementioned levels. We have also generalised the function to include other kinds of variables such as lat/lon for GIS applications, and to distinguish between integer and continuous variables, but the version I am posting below sticks to just those three levels. Certainly one can argue about which tests are applicable in which precise case, but at least the principle might be interesting to my dear readeRs.

I will write another post soon about using this function in order to display heatmaps of significance levels.

The function returns the p value, together with attributes for the sample size and test used. It is also convenient to test if the two variables are literally the same variable. You can do this by providing your variables with an attribute “varnames”. So if attr(x,“varnames”) is the same as attr(y,“varnames”) then the function returns 1 (instead of 0, which would be the result if you hadn’t provided those attributes).

`{r}

#some helper functions

classer=function(x){
y=class(x)[1]

s=switch(EXPR=y,“integer”=“con”,“factor”=“nom”,“character”=“str”,“numeric”=“con”,“ordered”=“ord”,“logical”=“log”)
s
}

xc=function(stri,sepp=" “) (strsplit(stri, sepp)[[1]]) #so you can type xc(”red blue green“) instead of c(”red“,”blue“,”green“)

#now comes the main function
xtabstat=function(v1=xy,v2,level1=“nom”,level2=“nom”,spv=F,…){
p=1
if(length(unique(v1))<2 | length(unique(v2))<2) p else {

havevarnames=!is.null(attr(v1,“varnames”)) & !is.null(attr(v2,“varnames”))
notsame=T; if (havevarnames)notsame=attr(v1,“varnames”)!=attr(v2,“varnames”)
if(!havevarnames) warning(paste(“If you don’t provide varnames I can’t be sure the two variables are not identical”),attr(v2,“label”),attr(v2,“label”))
if(notsame | !havevarnames){

if(min(length(which(table(v1)!=0)),length(which(table(v2)!=0)))>1) {
level1=classer(v1)
level2=classer(v2)
if(level1==“str”) level1=“nom”
if(level2==“str”) level2=“nom”

# if(attr(v1,“ncol”)==2 & attr(v2,“ncol”)==9)
if(level1 %in% xc(“nom geo”) & level2 %in% xc(“nom geo”)) {if(class(try(chisq.test(v1,v2,…)))!=“try-error”){
pp=chisq.test(v1,factor(v2),simulate.p.value=spv,…)
p=pp$p.value;attr(p,“method”)=“Chi-squared test”
attr(p,“estimate”)=pp$statistic
}else p=1
}

else if(level1==“ord” & level2 %in% xc(“nom geo”))
{if(class(try(kruskal.test(v1,factor(v2),…)))!=“try-error”){
pp=kruskal.test(v1,factor(v2),…)
ppp<<-pp
p=pp$p.value
attr(p,“estimate”)=pp$statistic
} else {
p=1
attr(p,“method”)=“Kruskal test”
}
}

else if(level1 %in% xc(“nom geo”) & level2==“ord”)
{if(class(try(kruskal.test(v2,factor(v1),…)))!=“try-error”){
pp=kruskal.test(v2,factor(v1),…)
p=pp$p.value;attr(p,“estimate”)=pp$statistic
} else {
p=1
attr(p,“method”)=“Kruskal test”
}
}

else if((level1==“ord” & level2==“ord”) | (level1==“ord” & level2==“con”) | (level1==“con” & level2==“ord”)) {if(class(try(cor.test(as.numeric(v1),as.numeric (v2),method=“spearman”,…)))!=“try-error”) {pp=cor.test(as.numeric(v1),as.numeric (v2),method=“spearman”,…);p=pp$p.value;attr(p,“method”)=“Spearman rho.”;attr(p,“estimate”)=pp$estimate} else cat(“not enough finite observations for Spearman”)}

else if( level1==“con” & level2 %in% xc(“nom geo”)) {
if(class(try(anova(lm(as.numeric(v1)~v2))))!=“try-error”){

pp=anova(lm(as.numeric(v1)~v2));p=pp$“Pr(>F)”[1];attr(p,“estimate”)=pp[“F value”];attr(p,“method”)=“ANOVA F”
}else p=1}

else if( level1 %in% xc(“nom geo”) & level2 %in% xc(“con”)) {
if(class(try(anova(lm(as.numeric(v2)~v1))))!=“try-error”){

pp=anova(lm(as.numeric(v2)~v1));p=pp$“Pr(>F)”[1];attr(p,“estimate”)=pp[“F value”];attr(p,“method”)=“ANOVA F”
}else p=1}

else if( level1==“con” & level2 %in% xc(“ord”)) {
if(class(try(anova(lm(as.numeric(v1)~v2))))!=“try-error”){

pp=anova(lm(as.numeric(v1)~v2));p=pp$“Pr(>F)”[1];attr(p,“estimate”)=pp[“F value”];attr(p,“method”)=“ANOVA F”
}else p=1}

else if( level1==“ord” & level2 %in% xc(“con”)) {
if(class(try(anova(lm(as.numeric(v2)~v1))))!=“try-error”){

pp=anova(lm(as.numeric(v2)~v1));p=pp$“Pr(>F)”[1];attr(p,“estimate”)=pp[“F value”];attr(p,“method”)=“ANOVA F”
}else p=1}

##TODO think if these are the best tests
else if(level1==“con” & level2==“con”)
{
# ;
pp=cor.test(as.numeric(v1),as.numeric(v2))
p=pp$p.value
attr(p,“method”)=“Pearson correlation”
attr(p,“estimate”)=pp$estimate

}

# else if(level1==“str” | level2 ==“str”) stop(P(“You are trying to carry out stats tests for a string variable”,attr(v1,“varnames”)," or “,attr(v2,”varnames“),”. You probably want to convert to nominal.“))
else {p=1
attr(p,”estimate“)=NULL
}
attr(p,”N“)=nrow(na.omit(data.frame(v1,v2)))
}
} else {p=1;attr(p,”N“)=sum(!is.na(v1))} #could put stuff here for single-var analysis

if(is.na(p))p=1
p
}
}

## now let’s try this out on a mixed dataset. Load mtcars and convert some vars to ordinal and nominal.
mt=mtcars
mt$gear=factor(mt$gear,ordered=T)
mt$cyl=factor(mt$cyl,ordered=F)

s=sapply(mt,function(x){sapply(mt,function(y){
xtabstat(x,y)
})
}
)
heatmap(s)

`

Rplot13


Comments


socialdatablog © Steve Powell Powered by Pelican and Twitter Bootstrap. Icons by Font Awesome and Font Awesome More