In survey research, our datasets nearly always comprise variables with mixed measurement levels – in particular, nominal, ordinal and continuous, or in R-speak, unordered factors, ordered factors and numeric variables. Sometimes it is useful to be able to do blanket tests of one set of variables (possibly of mixed level) against another without having to worry about which test to use.

For this we have developed an omni function which can do binary tests of significance between pairs of variables, either of which can be any of the three aforementioned levels. We have also generalised the function to include other kinds of variables such as lat/lon for GIS applications, and to distinguish between integer and continuous variables, but the version I am posting below sticks to just those three levels. Certainly one can argue about which tests are applicable in which precise case, but at least the principle might be interesting to my dear readeRs.

I will write another post soon about using this function in order to display heatmaps of significance levels.

The function returns the p value, together with attributes for the sample size and test used. It is also convenient to test if the two variables are literally the same variable. You can do this by providing your variables with an attribute “varnames”. So if attr(x,”varnames”) is the same as attr(y,”varnames”) then the function returns 1 (instead of 0, which would be the result if you hadn’t provided those attributes).

```
``````{r}

#some helper functions

classer=function(x){

y=class(x)[1]

s=switch(EXPR=y,"integer"="con","factor"="nom","character"="str","numeric"="con","ordered"="ord","logical"="log")

s

}

xc=function(stri,sepp=" ") (strsplit(stri, sepp)[[1]]) #so you can type xc("red blue green") instead of c("red","blue","green")

#now comes the main function

xtabstat=function(v1=xy,v2,level1="nom",level2="nom",spv=F,...){

p=1

if(length(unique(v1))<2 | length(unique(v2))<2) p else {

havevarnames=!is.null(attr(v1,"varnames")) & !is.null(attr(v2,"varnames"))

notsame=T; if (havevarnames)notsame=attr(v1,"varnames")!=attr(v2,"varnames")

if(!havevarnames) warning(paste("If you don't provide varnames I can't be sure the two variables are not identical"),attr(v2,"label"),attr(v2,"label"))

if(notsame | !havevarnames){

if(min(length(which(table(v1)!=0)),length(which(table(v2)!=0)))>1) {

level1=classer(v1)

level2=classer(v2)

if(level1=="str") level1="nom"

if(level2=="str") level2="nom"

# if(attr(v1,"ncol")==2 & attr(v2,"ncol")==9)

if(level1 %in% xc("nom geo") & level2 %in% xc("nom geo")) {if(class(try(chisq.test(v1,v2,...)))!="try-error"){

pp=chisq.test(v1,factor(v2),simulate.p.value=spv,...)

p=pp$p.value;attr(p,"method")="Chi-squared test"

attr(p,"estimate")=pp$statistic

}else p=1

}

else if(level1=="ord" & level2 %in% xc("nom geo"))

{if(class(try(kruskal.test(v1,factor(v2),...)))!="try-error"){

pp=kruskal.test(v1,factor(v2),...)

ppp<<-pp

p=pp$p.value

attr(p,"estimate")=pp$statistic

} else {

p=1

attr(p,"method")="Kruskal test"

}

}

else if(level1 %in% xc("nom geo") & level2=="ord")

{if(class(try(kruskal.test(v2,factor(v1),...)))!="try-error"){

pp=kruskal.test(v2,factor(v1),...)

p=pp$p.value;attr(p,"estimate")=pp$statistic

} else {

p=1

attr(p,"method")="Kruskal test"

}

}

else if((level1=="ord" & level2=="ord") | (level1=="ord" & level2=="con") | (level1=="con" & level2=="ord")) {if(class(try(cor.test(as.numeric(v1),as.numeric (v2),method="spearman",...)))!="try-error") {pp=cor.test(as.numeric(v1),as.numeric (v2),method="spearman",...);p=pp$p.value;attr(p,"method")="Spearman rho.";attr(p,"estimate")=pp$estimate} else cat("not enough finite observations for Spearman")}

else if( level1=="con" & level2 %in% xc("nom geo")) {

if(class(try(anova(lm(as.numeric(v1)~v2))))!="try-error"){

pp=anova(lm(as.numeric(v1)~v2));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F value"];attr(p,"method")="ANOVA F"

}else p=1}

else if( level1 %in% xc("nom geo") & level2 %in% xc("con")) {

if(class(try(anova(lm(as.numeric(v2)~v1))))!="try-error"){

pp=anova(lm(as.numeric(v2)~v1));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F value"];attr(p,"method")="ANOVA F"

}else p=1}

else if( level1=="con" & level2 %in% xc("ord")) {

if(class(try(anova(lm(as.numeric(v1)~v2))))!="try-error"){

pp=anova(lm(as.numeric(v1)~v2));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F value"];attr(p,"method")="ANOVA F"

}else p=1}

else if( level1=="ord" & level2 %in% xc("con")) {

if(class(try(anova(lm(as.numeric(v2)~v1))))!="try-error"){

pp=anova(lm(as.numeric(v2)~v1));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F value"];attr(p,"method")="ANOVA F"

}else p=1}

##TODO think if these are the best tests

else if(level1=="con" & level2=="con")

{

# ;

pp=cor.test(as.numeric(v1),as.numeric(v2))

p=pp$p.value

attr(p,"method")="Pearson correlation"

attr(p,"estimate")=pp$estimate

}

# else if(level1=="str" | level2 =="str") stop(P("You are trying to carry out stats tests for a string variable",attr(v1,"varnames")," or ",attr(v2,"varnames"),". You probably want to convert to nominal."))

else {p=1

attr(p,"estimate")=NULL

}

attr(p,"N")=nrow(na.omit(data.frame(v1,v2)))

}

} else {p=1;attr(p,"N")=sum(!is.na(v1))} #could put stuff here for single-var analysis

if(is.na(p))p=1

p

}

}

## now let's try this out on a mixed dataset. Load mtcars and convert some vars to ordinal and nominal.

mt=mtcars

mt$gear=factor(mt$gear,ordered=T)

mt$cyl=factor(mt$cyl,ordered=F)

s=sapply(mt,function(x){sapply(mt,function(y){

xtabstat(x,y)

})

}

)

heatmap(s)

```

Hi Steve,

Could you please fix all of your images to have them uploaded instead of using copy-paste from knitr? This copy-pasting can not be read into r-bloggers.

Thanks!

Tal

Fixed on this post – sorry, didn’t realise r-bloggers doesn’t like inline images like this.

I’m not sure why but this website is loading incredibly

slow for me. Is anyone else having this issue or is it a

problem on my end? I’ll check back later and see if

the problem still exists.