Say you have two samples and you want to determine whether they come from the same population, i.e. whether they are "different". You could just compare their means, and if the means differ you are good to go... right? Well, what if they are pretty close? How close is close enough?
To test this we have the t-test, which tells us whether two samples are significantly different from one another.
library(data.table) # melt() for reshaping
library(ggplot2) # plotting
library(plotly) # interactive plots
x <- rnorm(200, mean=18, sd=2) # generate normally distributed data
y <- rnorm(200, mean=22, sd=2) # generate normal data with a different mean
df <- data.frame(x=x, y=y) # put it in a data frame
df <- data.frame(melt(as.data.table(df))) # reshape to long format (columns: variable, value)
# Plot
p <- ggplot(df, aes(x=value, fill=variable, color=variable)) +
geom_histogram(binwidth=1, alpha=0.5, position = "identity") +
ggtitle("Comparing means") + xlab("Value") + ylab("Frequency")
ggplotly(p, width=640, height=640)
# t-test
t.test(x,y)
        Welch Two Sample t-test

data:  x and y
t = -20.552, df = 397.77, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -4.501817 -3.715759
sample estimates:
mean of x mean of y 
 17.90109  22.00988 
In this example we see that the two-sample t-test confirms that the samples $x$ and $y$ are significantly different!
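If you want the numbers programmatically rather than just reading the console output, the object returned by t.test is a list with named components:
tt <- t.test(x, y) # returns an object of class "htest"
tt$p.value # the p-value
tt$conf.int # 95 percent confidence interval for the difference in means
tt$estimate # the two sample means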
This is great, but what if there are more than two samples? Then we can use Analysis of Variance (ANOVA) to determine whether at least one of the samples is different.
x <- rnorm(200, mean=18, sd=2) # generate normally distributed data
y <- rnorm(200, mean=22, sd=3) # generate normal data with a different mean
z <- rnorm(200, mean=22, sd=1) # generate data with the same mean as y
df <- data.frame(x=x, y=y, z=z) # put it in a data frame
df <- data.frame(melt(as.data.table(df))) # reshape to long format
# Plot
p <- ggplot(df, aes(x=value, fill=variable, color=variable)) +
geom_histogram(binwidth=1, alpha=0.5, position = "identity") +
ggtitle("Comparing means") + xlab("Value") + ylab("Frequency")
ggplotly(p, width=640, height=640)
# ANOVA
summary(aov(df$value ~ df$variable))
             Df Sum Sq Mean Sq F value Pr(>F)    
df$variable   2   2011  1005.7   196.5 <2e-16 ***
Residuals   597   3055     5.1                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The result of this ANOVA shows that there is a significant difference somewhere in the data, but $y$ and $z$ were generated with the same mean! We need post-hoc analysis to determine which groups are different.
You may be tempted to use the t-test to compare each pair of groups. However, with an error rate of 0.05 per test, after just 3 comparisons the chance of at least one false positive rises to 0.143. This is a big problem: as we add more groups and need to make more comparisons, the error rate of all of the tests together increases very quickly. This is called the experiment-wise error rate (EWER). For $m$ comparisons at significance level $\alpha$:

$$ EWER = 1 - (1 - \alpha)^{m} $$
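A quick numeric check of how fast this compounds, just evaluating the formula above:
alpha <- 0.05
m <- 1:10 # number of pairwise comparisons
round(1 - (1 - alpha)^m, 3) # m = 3 already gives 0.143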
What we need is a test which maintains a specified error rate across all of the tests. We can use the Tukey Honest Significant Difference (HSD) test to determine which groups are different while maintaining a specified EWER. The following plot shows which groups differ: the comparisons are denoted on the y-axis and the difference in the means on the x-axis. The error bars on either side are determined by the Q-distribution. If the error bars cross the 0 line, there is not a significant difference; if they do not, there is.
plot(TukeyHSD(aov(df$value ~ df$variable)))
As we can see above, $x$ and $y$ are different, $x$ and $z$ are different, but $z$ and $y$ are not (as expected).
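For reference, the underlying numbers are easy to get as well; TukeyHSD returns, for each pair of groups, the difference in means, the lower and upper bounds of the interval, and an adjusted p-value:
TukeyHSD(aov(df$value ~ df$variable)) # columns: diff, lwr, upr, p adj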
Obviously we don't HAVE to use a plot to conduct the Tukey HSD, but personally I like to be able to visualize the differences; I think it helps in understanding the data. The issue with this plot can be seen below.
# Create 10 groups
x <- rnorm(200, mean=18, sd=2) # generate normally distributed data
y <- rnorm(200, mean=22, sd=3) # generate normal data with a different mean
z <- rnorm(200, mean=22, sd=1) # generate data with the same mean as y
a <- rnorm(200, mean=21, sd=3) # seven more groups with assorted means and sds
b <- rnorm(200, mean=24, sd=2)
c <- rnorm(200, mean=22, sd=5)
d <- rnorm(200, mean=23, sd=4)
e <- rnorm(200, mean=28, sd=3)
f <- rnorm(200, mean=20, sd=2)
g <- rnorm(200, mean=20, sd=2) # f and g share the same parameters
df <- data.frame(x=x, y=y, z=z, a=a, b=b, c=c, d=d, e=e, f=f, g=g) # put it in a data frame
df <- data.frame(melt(as.data.table(df))) # reshape to long format
# Plot
p <- ggplot(df, aes(x=value, fill=variable, color=variable)) +
geom_histogram(binwidth=1, alpha=0.5, position = "identity") +
ggtitle("Comparing means") + xlab("Value") + ylab("Frequency")
ggplotly(p, width=640, height=640)
# ANOVA
summary(aov(df$value ~ df$variable))
# Tukey HSD
plot(TukeyHSD(aov(df$value ~ df$variable)), las=1)
              Df Sum Sq Mean Sq F value Pr(>F)    
df$variable    9  11933  1325.8   161.7 <2e-16 ***
Residuals   1990  16312     8.2                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The problem should be clear in the Tukey HSD plot. The number of comparisons is $\binom{N}{2}$, where $N$ is the number of groups, so as we add more groups the number of comparisons grows quickly and the data become harder to visualize.
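Just to put numbers on that growth:
choose(10, 2) # 45 pairwise comparisons for 10 groups
sapply(c(5, 10, 20), choose, k=2) # 10, 45, 190 -- growth is quadratic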
Here is my solution to the problem: the Square-Q plot.
library(dplyr) # group_by, summarise, count
library(ggrepel) # geom_label_repel

sq.q <- function(n, group, alpha=0.05) {
  # accepts a numeric vector, a grouping variable, and an alpha value;
  # draws a simplified Tukey plot
  dataframe <- data.frame(group, n) # put in a data frame
  model <- aov(n ~ as.factor(group)) # fit the ANOVA model
  # get the mean of each factor and merge into one data frame
  means <- dataframe %>% group_by(group) %>% summarise(n = mean(n))
  table <- merge(x=count(dataframe, group), y=means, by=1)
  MSE <- summary(model)[1][[1]][[3]][[2]] # get the MSE
  df <- summary(model)[1][[1]][[1]][[2]] # get the residual degrees of freedom
  N <- nrow(table) # number of factors
  n_min <- min(table$n.x) # minimum sample size of a factor
  # compute the Tukey range using the minimum sample size
  table$tukey <- qtukey(1-alpha, N, df=df) *
    sqrt(MSE/2 * (1/table$n.x + 1/n_min))
  # absolute differences between all pairs of means
  DIFF <- abs(outer(table$n.y, table$n.y, '-'))
  # using the Tukey estimate above, count the significant differences
  tab_tuk <- matrix(table$tukey, nrow=length(table$tukey),
                    ncol=length(table$tukey), byrow=TRUE)
  diff_t <- length(which((DIFF-tab_tuk) > 0))
  # calculate the actual pairwise Tukey range
  # and count the significant differences it finds
  cal_tuk <- qtukey(1-alpha, N, df=df) *
    sqrt(outer(1/table$n.x, 1/table$n.x, "+") * (MSE/2))
  diff_c <- length(which((DIFF-cal_tuk) > 0))
  title <- ""
  subtitle <- ""
  caption <- ""
  # labeling and captioning based on accuracy; if the sample sizes are close enough, all good
  if ((diff_c - diff_t)%/%2 == 0) {
    title <- sprintf("%0.f%% Family-Wise Confidence Level", 100*(1-alpha))
    subtitle <- "Result accurately reflects Tukey HSD"
    caption <- sprintf("Tukey HSD finds %1.0f total significant differences", diff_c%/%2)
  }
  else {
    title <- sprintf("At least %0.f%% Family-Wise Confidence Level", 100*(1-alpha))
    subtitle <- sprintf("Sample sizes not close enough; result misses %1.0f significant differences compared to Tukey HSD.", (diff_c - diff_t)%/%2)
    caption <- sprintf("Tukey HSD finds %1.0f total significant differences", diff_c%/%2)
  }
  # Tukey HSD plot, ordered from smallest mean to largest
  ggplot(table, aes(reorder(x=as.factor(group), n.y),
                    y=n.y, col=as.factor(group))) +
    # plot the Tukey ranges as crossbars around each mean
    geom_crossbar(aes(ymin=n.y-tukey, ymax=n.y+tukey, fill=as.factor(group)),
                  width=0.22, alpha=0.6, linetype="blank") +
    # draw a horizontal line across the plot at each group mean
    geom_hline(aes(yintercept=n.y, color=as.factor(group))) +
    # add group labels at the right end of the mean lines
    geom_label_repel(aes(N+0.5, n.y, label=group), direction="y") +
    xlab("Factors") + ylab("Means") + theme(legend.position="none") +
    labs(title=title,
         subtitle=subtitle,
         caption=caption) +
    scale_color_discrete() # difficult to read without color
}
sq.q(df$value, df$variable)
The above chart might look kind of intimidating at first; there is a lot going on, and there are a lot of colors.
The bottom line for Tukey HSD is that the absolute difference between the means needs to be larger than the range specified by the Q distribution and scaled by the Mean Squared Error (MSE) and the sample sizes of the groups being compared.
$$ | \bar{X}_i - \bar{X}_j | > Q_{\alpha,k,N-k} \sqrt{\frac{MSE}{2}\left(\frac{1}{n_i} + \frac{1}{n_j}\right)} $$

That is where this graph can run into problems: if the sample sizes are widely different, the range is different for each comparison. This test uses the minimum sample size for every comparison, which guarantees that any difference found by this test will also be found by a Tukey HSD. However, if the sample sizes are too widely different and the means are close to the edge of the range, there is potential for some differences to be missed.
$$ Q_{\alpha,k,N-k} \sqrt{\frac{MSE}{2}\left(\frac{1}{n_i} + \frac{1}{n_{min}}\right)} \geq Q_{\alpha,k,N-k} \sqrt{\frac{MSE}{2}\left(\frac{1}{n_i} + \frac{1}{n_j}\right)} $$

but if all sample sizes $n_i$ are the same, then these quantities are equal.
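A quick numeric sketch of this bound, using the MSE and degrees of freedom from the ten-group example above and some hypothetical (made-up) unequal sample sizes:
k <- 10; dof <- 1990; MSE <- 8.2; alpha <- 0.05 # from the ANOVA output above
n_i <- 200; n_j <- 150; n_min <- 50 # hypothetical unequal sample sizes
q <- qtukey(1-alpha, k, df=dof)
conservative <- q * sqrt(MSE/2 * (1/n_i + 1/n_min)) # range using the minimum sample size
exact <- q * sqrt(MSE/2 * (1/n_i + 1/n_j)) # actual pairwise Tukey HSD range
conservative >= exact # TRUE: the conservative range is never narrower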
This different approach to visualizing the Tukey HSD, while not quite as precise, provides valuable insight into categorical data, enables conclusions to be drawn quickly, and identifies areas which require further analysis.