Introduction

This file contains the data analyses of our empirical evaluation regarding the classification of generated white-box tests. The goal of this document is to provide a detailed description of how we analyzed the data obtained from the study. As we had two studies – an original and a replication – the analyses are divided correspondingly.

Reading the inputs

Parsing participant answers

Each participant answer is stored in a JSON file. These are read into data frames.

# Reads every participant answer file (JSON) found below `results_folder`
# into a single data frame.
#
# results_folder: root directory that is searched recursively.
#
# Returns the rows of all parsed files stacked on top of each other, each
# prefixed with a numeric `pid` column. The participant identifier comes from
# the directory layout: the folder directly containing the file when its name
# is numeric (original study), otherwise the parent of that folder
# (replication). An empty data frame is returned when no file is found.
read_answers_from_folder <- function(results_folder) {
  # `pattern` is a regular expression, not a glob: anchor on the extension so
  # that only files whose name ends in ".json" are picked up.
  answer_files <- list.files(results_folder, pattern = "\\.json$", recursive = TRUE, full.names = TRUE)

  read_one <- function(json) {
    # Getting the answers of the participant from the JSON file
    answers <- data.frame(fromJSON(readLines(json), simplifyDataFrame = TRUE), stringsAsFactors = FALSE)
    # Getting the PID from the directory structure
    containing_dir <- dirname(json)
    pid <- suppressWarnings(as.numeric(basename(containing_dir))) # For the original study
    if (is.na(pid)) {
      pid <- as.numeric(basename(dirname(containing_dir))) # For the replication
    }
    # Creating the column for the participant identifier
    cbind(pid = rep(pid, nrow(answers)), answers)
  }

  per_file <- lapply(answer_files, read_one)
  if (length(per_file) == 0) {
    return(data.frame())
  }
  # Binding everything once avoids re-copying the accumulated rows per file
  do.call(rbind, per_file)
}

# Answers of the original study and of the replication (prefix `r_`); the
# folders come from the document parameters
result_answers <- read_answers_from_folder(params$results_folder)
r_result_answers <- read_answers_from_folder(params$r_results_folder)

Adding project and method information to the input

The answers are extended with project and method information based on the identifiers in the result file.


# Deriving the project from the participant identifier: PIDs above 40 belong
# to the second project of each study
result_answers$Project <- ifelse(result_answers$pid > 40, "MathNet", "NBitcoin")
r_result_answers$Project <- ifelse(r_result_answers$pid > 40, "NetTopologySuite", "NodaTime")

# Method names per project; every method is listed three times because three
# consecutive test identifiers map to it
nbitcoin_methods <- rep(
  c("CompareTo", "Constructor", "Equals", "Min", "Plus"),
  each = 3
)
mathnet_methods <- rep(
  c("Combinations", "CombinationsWithRepetition", "Permutations", "Variations", "VariationsWithRepetition"),
  each = 3
)
nodatime_methods <- rep(
  c("AddTo", "Between", "HasDateComponent", "Minus", "ToDuration"),
  each = 3
)
nettopology_methods <- rep(
  c("Extract", "IncreasingDirection", "IndexOf", "IsRing", "MinCoordinate"),
  each = 3
)

# Lookup tables: (test identifier, project) -> method
test_method_mapping <- data.frame(
  Results.Id = rep(0:14, times = 2),
  Method = c(nbitcoin_methods, mathnet_methods),
  Project = rep(c("NBitcoin", "MathNet"), each = 15)
)
r_test_method_mapping <- data.frame(
  Results.Id = rep(0:14, times = 2),
  Method = c(nodatime_methods, nettopology_methods),
  Project = rep(c("NodaTime", "NetTopologySuite"), each = 15)
)

# Attaching the method name to each answer row
result_answers <- plyr::join(result_answers, test_method_mapping, by = c("Results.Id", "Project"))
r_result_answers <- plyr::join(r_result_answers, r_test_method_mapping, by = c("Results.Id", "Project"))

# Recode specification "0='T0.1'; 1='T1.1'; ...; 14='T14.5'": the suffix is
# the 1-based index of the method (three tests per method)
test_id_recodes <- paste0(0:14, "='T", 0:14, ".", (0:14) %/% 3 + 1, "'", collapse = "; ")

# Attaching the test identifier to each answer row
result_answers$TestId <- car::recode(as.factor(result_answers$Results.Id), test_id_recodes)
r_result_answers$TestId <- car::recode(as.factor(r_result_answers$Results.Id), test_id_recodes)

# Dropping the helper objects
rm(
  nbitcoin_methods, mathnet_methods, nodatime_methods, nettopology_methods,
  test_method_mapping, r_test_method_mapping, test_id_recodes
)

Parsing golden answers

Parsing the golden answers created prior to the study. These are used to evaluate participants’ answers.

# Golden answers of the original study (semicolon-separated, with a header
# row; empty cells are read as NA)
nbitcoin_golden_answers <- read.csv(
  file.path(params$golden_folder, "nbitcoin-gold.csv"),
  header = TRUE, sep = ";", na.strings = "", stringsAsFactors = FALSE
)
mathnet_golden_answers <- read.csv(
  file.path(params$golden_folder, "mathnet-gold.csv"),
  header = TRUE, sep = ";", na.strings = "", stringsAsFactors = FALSE
)

# Golden answers of the replication study
nodatime_golden_answers <- read.csv(
  file.path(params$r_golden_folder, "nodatime-gold.csv"),
  header = TRUE, sep = ";", na.strings = "", stringsAsFactors = FALSE
)
nettopology_golden_answers <- read.csv(
  file.path(params$r_golden_folder, "nettopology-gold.csv"),
  header = TRUE, sep = ";", na.strings = "", stringsAsFactors = FALSE
)

Parsing background questionnaire answers

Before the study session, each participant filled a background questionnaire. The answers are parsed from a CSV file.

# Reads all background questionnaire files found below `results_folder`.
#
# results_folder: root directory that is searched recursively for files whose
#   name matches "background.csv".
#
# Returns one data frame with the rows of every file stacked; the first column
# of each file is renamed to "PID". An empty data frame is returned when no
# file is found.
parse_background_answers <- function(results_folder) {
  bg_answers <- list.files(results_folder, pattern="background.csv", recursive=TRUE, full.names=TRUE)
  result_bg_answers <- data.frame()
  for (answer_file in bg_answers) {
    answers <- read.csv(answer_file, header = TRUE, sep = ";", as.is = TRUE)
    colnames(answers)[1] <- "PID"
    result_bg_answers <- rbind(result_bg_answers, answers)
  }
  # Without this the value of the `for` loop (NULL) would be returned and the
  # callers would never see the parsed answers.
  return(result_bg_answers)
}

# Parsing the background questionnaires of the original study and of the
# replication (prefix `r_`)
result_bg_answers <- parse_background_answers(params$results_folder)
r_result_bg_answers <- parse_background_answers(params$r_results_folder)

Parsing exit survey answers

After the study, each participant filled a survey, which contained questions about the difficulties of the task performed. The results are parsed from a CSV.

# Collects the exit survey answers stored below `results_folder`.
#
# results_folder: root directory that is searched recursively for files whose
#   name matches "exit.csv".
#
# Returns the rows of all surveys stacked into one data frame, with the first
# column of every file renamed to "PID"; an empty data frame if no file exists.
parse_exit_answers <- function(results_folder) {
  survey_files <- list.files(results_folder, pattern="exit.csv", recursive=TRUE, full.names=TRUE)

  read_survey <- function(path) {
    survey <- read.csv(path, header = TRUE, sep = ";", as.is = TRUE)
    names(survey)[1] <- "PID"
    survey
  }

  # Left fold: start from an empty frame and append one survey after another
  Reduce(rbind, lapply(survey_files, read_survey), data.frame())
}

# Exit survey answers of the original study and of the replication (prefix `r_`)
exit_answers <- parse_exit_answers(params$results_folder)
r_exit_answers <- parse_exit_answers(params$r_results_folder)

Parsing video logs of original study

The activities of participants in the original study are extracted using coding of the screen recordings. These are saved to CSV files and parsed to data frames.

# Collecting the coded video annotations of every participant
video_files <- list.files(params$results_folder, pattern="video.csv", recursive=TRUE, full.names=TRUE)
video_lengths <- data.frame()
result_videos <- data.frame()
for (video in video_files) {
  # The file starts with a metadata header (max. 20 lines); it is read once
  # and used both for locating the table and for extracting the PID
  header_lines <- readLines(video, n = 20)
  # Number of lines to skip: everything before the row starting with "Time,"
  lines_to_skip <- grep("^Time,", header_lines) - 1
  # Reading annotations
  annotations <- read.csv(video, header = TRUE, skip = lines_to_skip, sep = ",", as.is = TRUE)
  # Creating the PID column (second comma-separated field of the first line),
  # one entry per annotation row
  pid <- rep(strsplit(header_lines[1], split = ",")[[1]][2], nrow(annotations))
  # Adding the PID column to the annotations
  annotations_with_pid <- cbind(pid, annotations)
  # Appending the new annotations to the previous ones
  result_videos <- rbind(result_videos, annotations_with_pid)
}

# Video length per participant. It is derived once from the complete table
# instead of being recomputed in every iteration; PIDs 55 and 59 are left out.
if (nrow(result_videos) > 0) {
  video_lengths <- result_videos %>%
    select(pid, Media.total.length) %>%
    distinct() %>%
    filter(pid != 55 & pid != 59)
}
video_lengths_nbitcoin <- filter(video_lengths, as.numeric(pid) < 41)
video_lengths_mathnet <- filter(video_lengths, as.numeric(pid) > 40)

rm(video,header_lines,lines_to_skip,video_files,annotations,annotations_with_pid,pid)

Parsing activity logs of the replication study

The replication study activities were properly recorded in log files (as opposed to the original study), thus we use these logs to easily obtain activity-related information.

# Reads the activity logs of the replication study.
#
# project_log: file name pattern of the portal log to look for below
#   `params$r_results_folder`.
#
# For every portal log found, the "experiment-vs.log" located two directory
# levels above it is read as well; both are stacked, tagged with the name of
# that directory as PID and sorted by their Date column. Returns all of these
# per-participant tables bound together (an empty tibble if no log is found).
parse_logs <- function(project_log) {
  log_paths <- list.files(params$r_results_folder, pattern = project_log, recursive = TRUE, full.names = TRUE)

  read_participant_log <- function(path) {
    participant_dir <- dirname(dirname(path))

    # Portal log: four `|`-separated text columns
    portal <- read.table(
      path,
      fill = TRUE, header = FALSE, sep = "|",
      colClasses = rep("character", 4), stringsAsFactors = FALSE
    )
    names(portal) <- c("Date", "Action", "Location", "Mark")

    # Matching experiment-vs.log of the same participant
    ide <- read.csv(
      file.path(participant_dir, "experiment-vs.log"),
      header = FALSE, sep = "|", na.strings = "", stringsAsFactors = FALSE
    )
    names(ide) <- c("Date", "Action", "Location")

    # Stacking both logs and ordering the entries by the date column
    combined <- bind_rows(portal, ide)
    combined$PID <- rep(basename(participant_dir), nrow(combined))
    arrange(combined, Date)
  }

  Reduce(bind_rows, lapply(log_paths, read_participant_log), tibble())
}

# Maps a NodaTime file path to the label used in the analysis.
#
# filePath: a single path (character string).
#
# Returns the test identifier ("T0.1" ... "T14.5") of the first fragment below
# that occurs literally in the path, "CUT" for Period.cs, "PUT" for
# PeriodTest.cs and "SUT" for everything else.
nodatime_test_to_seq <- function(filePath) {
  # Ordered lookup table: path fragment -> label (the first hit wins)
  label_by_fragment <- c(
    "AddToTest322" = "T0.1",
    "AddToTest327" = "T1.1",
    "AddToTestThrowsDivideByZeroException305" = "T2.1",
    "BetweenTest171" = "T3.2",
    "BetweenTest410" = "T4.2",
    "BetweenTestThrowsArgumentException616" = "T5.2",
    "HasDateComponentGetTest386" = "T6.3",
    "HasDateComponentGetTest407" = "T7.3",
    "HasDateComponentGetTest757" = "T8.3",
    "MinusTest479" = "T9.4",
    "MinusTestThrowsArgumentNullException26" = "T10.4",
    "MinusTestThrowsArgumentNullException333" = "T11.4",
    "ToDurationTest01" = "T12.5",
    "ToDurationTest122" = "T13.5",
    "ToDurationTestThrowsInvalidOperationException782" = "T14.5",
    "Period.cs" = "CUT",
    "PeriodTest.cs" = "PUT"
  )

  for (fragment in names(label_by_fragment)) {
    if (grepl(fragment, filePath, fixed = TRUE)) {
      return(label_by_fragment[[fragment]])
    }
  }

  # Nothing matched
  "SUT"
}

# Maps a NetTopologySuite file path to the label used in the analysis.
#
# filePath: a single path (character string).
#
# Returns the test identifier ("T0.1" ... "T14.5") of the first fragment below
# that occurs literally in the path, "CUT" for CoordinateArrays.cs, "PUT" for
# CoordinateArraysTest.cs and "SUT" for everything else.
nettopology_test_to_seq <- function(filePath) {
  # Ordered lookup table: path fragment -> label (the first hit wins)
  label_by_fragment <- c(
    "ExtractTest861" = "T0.1",
    "ExtractTestThrowsArgumentException547" = "T1.1",
    "ExtractTestThrowsArgumentException834" = "T2.1",
    "IncreasingDirectionTest112" = "T3.2",
    "IncreasingDirectionTest202" = "T4.2",
    "IncreasingDirectionTest578" = "T5.2",
    "IndexOfTest510" = "T6.3",
    "IndexOfTest618" = "T7.3",
    "IndexOfTest807" = "T8.3",
    "IsRingTest488" = "T9.4",
    "IsRingTest617" = "T10.4",
    "IsRingTest89" = "T11.4",
    "MinCoordinateTest441" = "T12.5",
    "MinCoordinateTest901" = "T13.5",
    "MinCoordinateTest993" = "T14.5",
    "CoordinateArrays.cs" = "CUT",
    "CoordinateArraysTest.cs" = "PUT"
  )

  for (fragment in names(label_by_fragment)) {
    if (grepl(fragment, filePath, fixed = TRUE)) {
      return(label_by_fragment[[fragment]])
    }
  }

  # Nothing matched
  "SUT"
}

# Builds the test label "T<seq>.<group>" from a 0-based test sequence number.
# The group is the 1-based index of the block of three consecutive tests the
# number falls into (0-2 -> 1, 3-5 -> 2, ...).
testSeq_to_seq <- function(testSeq) {
  group <- ceiling((as.integer(testSeq) + 1) / 3)
  paste0("T", testSeq, ".", group)
}

# These were the start times for the sessions
# (no time zone is passed, so strptime interprets them in the current one)
nodatime_start_time <-strptime("11/30/2017 3:00:00 PM", format = "%m/%d/%Y %I:%M:%S %p")
nettopology_start_time <- strptime("12/07/2017 2:52:00 PM", format="%m/%d/%Y %I:%M:%S %p")

# Parsing the logs
# (the argument is the file name pattern of the portal log; warnings raised
# while parsing are suppressed)
nodatime_log <- suppressWarnings(parse_logs("NodaTime.log"))
nettopology_log <- suppressWarnings(parse_logs("NetTopologySuite.log"))

Evaluating participant answers

Comparison with golden answers

The answers of each participant are compared with the golden answers. The comparison yields four outcomes using binary classification: TP, TN, FP, FN. We marked a case as positive where the test encoded an error, and as negative if the test was error-free.

# Classifies every participant answer against the golden answers.
#
# result_answers: one row per (participant, test) with the columns `pid`,
#   `Results.Id` and `Results.IsOK`.
# golden_answers: one row per test with the columns `id` and `isok`.
#
# A test that encodes an error is the positive case:
#   golden OK    & marked OK    -> "TN"     golden OK    & otherwise     -> "FP"
#   golden wrong & marked wrong -> "TP"     golden wrong & otherwise     -> "FN"
#
# Returns `result_answers` with the rows grouped by participant (participants
# in order of first appearance) and the outcome appended as the last column,
# `Check`. Stops when a test has no golden answer or a verdict is missing.
perform_answer_comparison <- function(result_answers, golden_answers) {
  # Looking up the golden answer of every row in one pass
  golden_row <- match(result_answers$Results.Id, golden_answers$id)
  if (anyNA(golden_row)) {
    missing_ids <- unique(result_answers$Results.Id[is.na(golden_row)])
    stop("No golden answer for test id(s): ", paste(missing_ids, collapse = ", "), call. = FALSE)
  }

  golden_ok <- golden_answers$isok[golden_row] == TRUE
  marked_ok <- result_answers$Results.IsOK == TRUE
  marked_wrong <- result_answers$Results.IsOK == FALSE
  if (anyNA(golden_ok) || anyNA(marked_ok)) {
    stop("Golden and participant answers must not contain missing values", call. = FALSE)
  }

  check <- character(nrow(result_answers))
  check[golden_ok & marked_ok] <- "TN"
  check[golden_ok & !marked_ok] <- "FP"
  check[!golden_ok & marked_wrong] <- "TP"
  check[!golden_ok & !marked_wrong] <- "FN"

  # Keeping the rows of a participant together; order() is stable, so the
  # original order of the answers within a participant is preserved
  by_participant <- order(match(result_answers$pid, unique(result_answers$pid)))
  participant_results <- result_answers[by_participant, , drop = FALSE]
  # Appending by name instead of renaming column 7, so the function does not
  # depend on the number of columns of its input
  participant_results$Check <- check[by_participant]

  return(participant_results)
}

# Original study: participants with a PID below 41 are evaluated against the
# NBitcoin golden answers, the others against the MathNet ones
nbitcoin_participant_results <- perform_answer_comparison(result_answers %>% filter(pid < 41), nbitcoin_golden_answers)
mathnet_participant_results <- perform_answer_comparison(result_answers %>% filter(pid >= 41), mathnet_golden_answers)
participant_results <- nbitcoin_participant_results %>% bind_rows(mathnet_participant_results)

# Replication study: same PID split, NodaTime vs. NetTopologySuite
nodatime_participant_results <- perform_answer_comparison(r_result_answers %>% filter(pid < 41), nodatime_golden_answers)
nettopology_participant_results <- perform_answer_comparison(r_result_answers %>% filter(pid >= 41), nettopology_golden_answers)
r_participant_results <- nodatime_participant_results %>% bind_rows(nettopology_participant_results)

Summary of binary classification

In the snippet below, we summarise the results of the binary classification for further analyses.

# Counts the classification outcomes of every participant.
#
# participant_results: one row per answer with the columns `pid`, `Project`
#   and `Check` ("TP", "FP", "TN" or "FN").
#
# Returns one row per participant with the columns PID, Project, TP, FP, TN
# and FN. Every new row is put in front of the rows collected so far, so the
# participants appear in reverse order of their first occurrence.
summarise_results <- function(participant_results) {
  count_outcome <- function(checks, outcome) {
    sum(checks == outcome, na.rm = TRUE)
  }

  summary_so_far <- data.frame(row.names = c("PID","Project","TP","FP","TN","FN"), stringsAsFactors = FALSE)
  for (participant in unique(participant_results$pid)) {
    own_rows <- participant_results[participant_results$pid == participant, ]
    checks <- own_rows$Check

    new_row <- data.frame(
      PID = participant,
      Project = own_rows$Project[1],
      TP = count_outcome(checks, "TP"),
      FP = count_outcome(checks, "FP"),
      TN = count_outcome(checks, "TN"),
      FN = count_outcome(checks, "FN")
    )
    summary_so_far <- rbind(new_row, summary_so_far)
  }
  summary_so_far
}

# Outcome counts per participant for the original study and for the
# replication (prefix `r_`)
participant_result_summary <- summarise_results(participant_results)
r_participant_result_summary <- summarise_results(r_participant_results)

Statistics of binary classification

Based on the binary classification summaries, we calculate the Accuracy, Misclassification rate, Sensitivity, Specificity, False Positive rate and the Matthews Correlation Coefficient statistics for each participant in both studies.

# Temporary helper: appends the derived statistics to a summary table that
# holds the TP, FP, TN and FN counts of every participant
add_classification_statistics <- function(counts) {
  tp <- counts$TP
  fp <- counts$FP
  tn <- counts$TN
  fn <- counts$FN

  counts$Accuracy <- (tp + tn) / 15 # (TP+TN)/(TP+TN+FP+FN)
  counts$Misclassification <- (fp + fn) / 15
  counts$Sensitivity <- tp / (tp + fn) # (hit rate)
  counts$Specificity <- tn / (tn + fp)
  counts$FalsePositiveRate <- 1 - counts$Specificity # False positive rate
  counts$MCC <- ((tp * tn) - (fp * fn)) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
  counts
}

# Original study
participant_result_summary <- add_classification_statistics(participant_result_summary)

# Replication study
r_participant_result_summary <- add_classification_statistics(r_participant_result_summary)

rm(add_classification_statistics)

The following two tables list the top participants of the original and the replication study, respectively, ranked by the MCC they reached. The top five were targeted in each study, but because there were ties in the MCC values, more than five participants are listed.

PID Project TP FP TN FN Accuracy Misclassification Sensitivity Specificity FalsePositiveRate MCC
47 MathNet 3 0 12 0 1.0000000 0.0000000 1 1.0000000 0.0000000 1.0000000
10 NBitcoin 3 0 12 0 1.0000000 0.0000000 1 1.0000000 0.0000000 1.0000000
64 MathNet 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
8 NBitcoin 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
2 NBitcoin 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
11 NBitcoin 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
PID Project TP FP TN FN Accuracy Misclassification Sensitivity Specificity FalsePositiveRate MCC
62 NetTopologySuite 3 0 12 0 1.0000000 0.0000000 1 1.0000000 0.0000000 1.0000000
22 NodaTime 3 0 12 0 1.0000000 0.0000000 1 1.0000000 0.0000000 1.0000000
72 NetTopologySuite 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
52 NetTopologySuite 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
51 NetTopologySuite 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
49 NetTopologySuite 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
45 NetTopologySuite 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
8 NodaTime 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
7 NodaTime 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
32 NodaTime 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
31 NodaTime 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
25 NodaTime 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
18 NodaTime 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562
16 NodaTime 3 1 11 0 0.9333333 0.0666667 1 0.9166667 0.0833333 0.8291562

Visualizing binary classification results

The following box plots are showing different measures of binary classification separated by the projects.

# Draws one box plot per binary classification measure, grouped by project.
#
# participant_result_summary: per-participant summary with the columns
#   Project, Accuracy, Misclassification, FalsePositiveRate, Sensitivity,
#   Specificity and MCC.
#
# Returns a list of six ggplot objects in this order: accuracy,
# misclassification, false positive rate, sensitivity (TPR), specificity
# (TNR) and Matthews correlation coefficient.
draw_binary_boxplots <- function(participant_result_summary) {
  # All six plots share the same recipe and differ only in the mapped measure,
  # the axis label and the range of the y axis.
  project_boxplot <- function(mapping, y_label, y_limits = c(0, 1), y_step = 0.1) {
    ggplot(participant_result_summary, mapping) +
      geom_boxplot() +
      xlab("Project") +
      ylab(y_label) +
      # "none" replaces the deprecated `guides(fill = FALSE)`
      guides(fill = "none") +
      theme_hc() +
      scale_y_continuous(limits = y_limits, breaks = seq(y_limits[1], y_limits[2], y_step))
  }

  # Accuracy
  acc <- project_boxplot(aes(x = factor(Project), y = Accuracy), "Accuracy")

  # Misclassification
  misc <- project_boxplot(aes(x = factor(Project), y = Misclassification), "Misclassification")

  # False positive rate
  fpr <- project_boxplot(aes(x = factor(Project), y = FalsePositiveRate), "False Positive Rate")

  # Sensitivity (TPR)
  # IN PAPER (boxplot-sensitivity.pdf, width=3.5, height=2.5)
  tpr <- project_boxplot(aes(x = factor(Project), y = Sensitivity), "TPR")

  # Specificity (true negative rate)
  # IN PAPER (boxplot-tnr.pdf, width=3.5, height=2.5)
  tnr <- project_boxplot(aes(x = factor(Project), y = Specificity), "TNR")

  # Matthews Correlation Coefficient, which ranges from -1 to 1
  # IN PAPER (boxplot-matthews.pdf, width=3.5, height=2.5)
  mcc <- project_boxplot(aes(x = factor(Project), y = MCC), "MCC", y_limits = c(-1, 1), y_step = 0.2)

  return(list(acc, misc, fpr, tpr, tnr, mcc))
}

Original study

Replication study

Visualizing individual binary classification results

The following plots visualize all of the answer results (TP, FP, TN, FN) given by participants in both studies.

# Draws the individual classification results as two tile maps.
#
# participant_results: one row per answer with the columns pid, TestId,
#   Results.Id, Project and Check ("TP", "FP", "TN" or "FN").
#
# Returns a list of two ggplot objects: the grid of the participants with a
# PID below 41 and the grid of the participants with a PID above 40. Each grid
# has one column per participant and one row per test, filled by the outcome.
draw_classification_grid <- function(participant_results) {
  # Naming the colours pins every outcome to its colour even when one of the
  # outcomes does not occur in a group (unnamed values are handed out by
  # position and would shift).
  cb_palette <- c(FN = "#680008", FP = "#e22828", TN = "#2cba5a", TP = "#01703e")

  classification_grid <- function(rows) {
    # The title has to be a single string, not one entry per answer row
    grid_title <- paste(unique(rows$Project), collapse = " / ")

    ggplot(rows, aes(x = factor(pid), y = reorder(factor(TestId), Results.Id), fill = as.factor(Check))) +
      ggtitle(grid_title) +
      geom_tile(alpha = 0.8, width = .9, height = .9) +
      theme(
        axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()
      ) +
      scale_fill_manual(values = cb_palette) +
      xlab("Participant ID") +
      ylab("Test ID") +
      labs(fill = "Result")
  }

  # IN PAPER (a-tile-map.pdf, width=6.5, height=3.5)
  a_grid <- classification_grid(participant_results[participant_results$pid < 41, ])

  # IN PAPER (b-tile-map.pdf, width=5.5, height=3.5)
  b_grid <- classification_grid(participant_results[participant_results$pid > 40, ])

  return(list(a_grid, b_grid))
}

Original study

Replication study

Answers from exit questionnaire

The following two charts show the participants’ answers to the exit survey. The questions concerned their feelings and impressions after the task.


# Statements of the exit survey, in the order of the agreement1..agreement11
# columns
questions <- c(
  "I had enough time to understand the class.",
  "I had enough time to review the tests.",
  "It was easy to understand the class.",
  "It was easy to understand the tests.",
  "I am certain I chose the right answers.",
  "Tests are difficult to read.",
  "Tests are too long to understand.",
  "Tests are too short to exercise useful behavior.",
  "Tests had too many assertions.",
  "Tests had meaningful assertions.",
  "It was easy to select faulty tests."
)

# Answer options from full agreement to full disagreement; the line breaks
# are part of the labels
options <- c(
  "Fully\nagree",
  "Partially\nagree",
  "Neither agree\nnor disagree",
  "Partially\ndisagree",
  "Fully\ndisagree"
)

# Turns raw exit survey answers into the table used for the Likert charts.
#
# answers: exit survey answers with one `agreement<i>` column per entry of the
#   global `questions` vector, holding the codes "fagree", "pagree",
#   "neither", "pdisagree" and "fdisagree".
#
# Rows whose `agreement2` is empty or missing are dropped. Returns a data
# frame with one column per question (named after `questions`); every column
# is a factor whose levels are the global `options`. Values that are neither
# a known code nor one of the option labels become NA.
extract_likert_data <- function(answers) {
  code_labels <- c(
    fagree = "Fully\nagree",
    pagree = "Partially\nagree",
    neither = "Neither agree\nnor disagree",
    pdisagree = "Partially\ndisagree",
    fdisagree = "Fully\ndisagree"
  )

  # is.na() guard: comparing NA with "" yields NA, which would select an
  # all-NA row instead of dropping the unanswered survey
  answered <- !is.na(answers$agreement2) & answers$agreement2 != ""
  agreement_columns <- paste0("agreement", seq_along(questions))
  data <- answers[answered, agreement_columns, drop = FALSE]

  names(data) <- questions

  for (i in seq_along(questions)) {
    raw <- as.character(data[[i]])
    label <- unname(code_labels[raw])
    # Values without a known code are kept as they are and left to factor()
    data[[i]] <- factor(ifelse(is.na(label), raw, label), levels = options)
  }

  return(data)
}

Original study

# Recoding the exit survey answers of the original study
likert_data <- extract_likert_data(exit_answers)

# Building the likert object from the question columns; the number of levels
# equals the number of answer options
ldt <- likert(subset(likert_data, select=questions), nlevels = length(options))

# IN PAPER
#pdf(file="c:\\PhD\\Repos\\paper-wbstudy-ist\\figures\\likert.pdf", width=12)
# Questions are kept in their original order (ordered=FALSE); the legend has
# no title and is placed below the chart
plot(ldt, ordered=FALSE) + theme(legend.title=element_blank(),legend.position = "bottom")

#dev.off()

Replication study

# Recoding the exit survey answers of the replication study
likert_data_rep <- extract_likert_data(r_exit_answers)

# Building the likert object from the question columns (this overwrites the
# `ldt` object of the original study)
ldt <- likert(subset(likert_data_rep, select=questions), nlevels = length(options))
# IN PAPER
#pdf(file="c:\\PhD\\Repos\\paper-wbstudy-ist\\figures\\likert-rep.pdf", width=12)
# Questions are kept in their original order (ordered=FALSE); the legend has
# no title and is placed below the chart
plot(ldt, ordered=FALSE) + theme(legend.title=element_blank(),legend.position = "bottom")

#dev.off()

Evaluating participant activities

To understand how participants behaved during the studies, we extract their actions from the video logs (in the case of the original study) and from the activity logs (in the case of the replication study).

Extracting original study activities from video logs

As we had technical difficulties with logging the activities in the first study, we reconstructed the logs from the videos by coding each activity that we wanted to analyze.

# Empty accumulators for the results of the video-annotation replay.
# The intended column names are passed as `row.names`, so each object is a
# zero-column data frame whose row names are those labels.
extended_video_annotations <- data.frame(
  row.names = c("PID", "Timestamp", "Behavior", "Modifier", "Active", "Page",
                "Window", "RunCase", "DebugLength", "DebugCase"),
  stringsAsFactors = FALSE
)

# Time per VS window: the 15 tests plus the S, C and P windows and the total
vs_time_summary <- data.frame(
  row.names = c("PID", paste0("T", 0:14, ".", rep(1:5, each = 3)), "S", "C", "P", "Total"),
  stringsAsFactors = FALSE
)

# Per-test accumulators: "PID" followed by T0.1, T1.1, ..., T14.5
# (test number, then the number of the method it belongs to)
portal_time_summary <- marked_wrong_summary <- marked_ok_summary <-
  cut_time_for_test <- sut_time_for_test <- put_time_for_test <-
  data.frame(
    row.names = c("PID", paste0("T", 0:14, ".", rep(1:5, each = 3))),
    stringsAsFactors = FALSE
  )

# Replays the coded video annotations of every participant in order and accumulates:
#   - the time per Visual Studio (VS) window (vs_time_summary) and per portal page (portal_time_summary),
#   - how many times each test was marked as OK / WRONG (marked_ok_summary, marked_wrong_summary),
#   - the time spent in the C, S and P windows, attributed to the portal page that was open (cut/sut/put_time_for_test),
#   - a copy of every annotation extended with the replayed state (extended_video_annotations).
# Reads result_answers and result_videos, which are defined outside of this chunk.
for(pid in unique(result_answers$pid)) {
  annotations_for_participant <- result_videos[result_videos$pid == pid,]
  # One zero-initialised summary row per output table. Column 1 is PID, so test number k is stored in column k+2.
  vs_time_summary_row <- data.frame(PID = pid, T0.1 = 0, T1.1 = 0, T2.1 = 0, T3.2 = 0, T4.2 = 0, T5.2 = 0, T6.3 = 0, T7.3 = 0, T8.3 = 0, T9.4 = 0, T10.4 = 0, T11.4 = 0, T12.5 = 0, T13.5 = 0, T14.5 = 0, S = 0, C = 0, P = 0, Total = 0, stringsAsFactors = FALSE)
  portal_time_summary_row <- data.frame(PID = pid, T0.1 = 0, T1.1 = 0, T2.1 = 0, T3.2 = 0, T4.2 = 0, T5.2 = 0, T6.3 = 0, T7.3 = 0, T8.3 = 0, T9.4 = 0, T10.4 = 0, T11.4 = 0, T12.5 = 0, T13.5 = 0, T14.5 = 0)
  marked_wrong_summary_row <- data.frame(PID = pid, T0.1 = 0, T1.1 = 0, T2.1 = 0, T3.2 = 0, T4.2 = 0, T5.2 = 0, T6.3 = 0, T7.3 = 0, T8.3 = 0, T9.4 = 0, T10.4 = 0, T11.4 = 0, T12.5 = 0, T13.5 = 0, T14.5 = 0)
  marked_ok_summary_row <- data.frame(PID = pid, T0.1 = 0, T1.1 = 0, T2.1 = 0, T3.2 = 0, T4.2 = 0, T5.2 = 0, T6.3 = 0, T7.3 = 0, T8.3 = 0, T9.4 = 0, T10.4 = 0, T11.4 = 0, T12.5 = 0, T13.5 = 0, T14.5 = 0)
  cut_time_for_test_row <- data.frame(PID = pid, T0.1 = 0, T1.1 = 0, T2.1 = 0, T3.2 = 0, T4.2 = 0, T5.2 = 0, T6.3 = 0, T7.3 = 0, T8.3 = 0, T9.4 = 0, T10.4 = 0, T11.4 = 0, T12.5 = 0, T13.5 = 0, T14.5 = 0)
  sut_time_for_test_row <- data.frame(PID = pid, T0.1 = 0, T1.1 = 0, T2.1 = 0, T3.2 = 0, T4.2 = 0, T5.2 = 0, T6.3 = 0, T7.3 = 0, T8.3 = 0, T9.4 = 0, T10.4 = 0, T11.4 = 0, T12.5 = 0, T13.5 = 0, T14.5 = 0)
  put_time_for_test_row <- data.frame(PID = pid, T0.1 = 0, T1.1 = 0, T2.1 = 0, T3.2 = 0, T4.2 = 0, T5.2 = 0, T6.3 = 0, T7.3 = 0, T8.3 = 0, T9.4 = 0, T10.4 = 0, T11.4 = 0, T12.5 = 0, T13.5 = 0, T14.5 = 0)
  # Replay state of the participant
  currently_active <- NA # "Portal" or "VS" once the first activation has been seen
  vs_window <- NA # Last window selected in VS (value of Modifier.1)
  portal_page <- NA # Last page opened in the portal ("H" is the home page)
  debug_start_index <- NA # Row index of the annotation that started the running debug session
  vs_start_index <- -1 # -1: VS has not been activated yet, otherwise the row index of activation point
  window_start_index <- -1 # -1: A window in VS has not been activated yet, otherwise the row index of activation point
  page_start_index <- -1 # -1: A page in the portal has not been activated yet, otherwise the row index of activation point
  
  # Iterating through each row of a participant
  # NOTE(review): 1:nrow(...) evaluates to c(1, 0) for a participant without annotations;
  # only the is.na(row$pid) guard below stops the loop in that case.
  for(i in 1:nrow(annotations_for_participant)) {
    # Getting the corresponding row
    row <- annotations_for_participant[i,]
    # Indexing a row that does not exist (e.g., row 1 of an empty data frame) yields an all-NA row;
    # processing of this participant stops there.
    if(is.na(row$pid)) { 
      break; 
    } 
    # Per-annotation values; they stay NA unless the matching branch below sets them
    marked_case <- NA
    run_case <- NA
    debug_time <- NA
    debug_case <- NA
    # If the portal has been activated
    if(row$Behavior == "Portal activated") {
      currently_active <- "Portal" # Setting the currently active variable to Portal
      if(is.na(portal_page)) {
        # If the portal is activated and there were no previous pages, then the home page (H) will open
        portal_page = "H"
      }
      page_start_index <- i
      ##### VS full timer summarization #####
      # If there was a VS activation before, it must be stopped and added to the summary
      if(vs_start_index != -1) {
        # The elapsed time is the current row timestamp minus the previous VS activation timestamp
        elapsed_vs_time <- row$Time - annotations_for_participant[annotations_for_participant$pid == pid,][vs_start_index,]$Time
        # The elapsed time is added to the row
        vs_time_summary_row$Total <- vs_time_summary_row$Total + elapsed_vs_time
      }
      # NOTE(review): the timers are reset to NA here, but the guards test `!= -1`. With an NA index
      # the guard condition is NA and `if` raises an error, so a second "Portal activated" (or a
      # "Changed window in VS") without a "VS activated" in between would stop the script.
      vs_start_index <- NA # There is no start index for VS, waiting for the next
      
      ##### VS window timer summarization #####
      # If there was a VS window activation before, it must stopped and added to the summary
      if(window_start_index != -1) {
        # Getting the row of previous window activation in VS.
        # One extended row is appended per processed annotation, so index i of the annotations
        # addresses the same annotation among this participant's extended rows.
        window_start_row <- extended_video_annotations[extended_video_annotations$PID == pid,][window_start_index,]
        # Calculating the elapsed time in that particular window.
        # The extended rows have no `Time` column: `$Time` resolves to `Timestamp` through the
        # partial matching of `$` on data frames.
        elapsed_window_time <- row$Time - window_start_row$Time
        
        # If the window was not C, S, N, P or was not NA (it is a numbered test case)
        if(window_start_row$Window != "C" && window_start_row$Window != "S" && window_start_row$Window != "N" && window_start_row$Window != "P" && !is.na(window_start_row$Window)) {
          # e.g., TC0 -> 2. column index
          vs_time_summary_row[,as.numeric(window_start_row$Window)+2] <- vs_time_summary_row[,as.numeric(window_start_row$Window)+2] + elapsed_window_time 
        } else if(window_start_row$Window != "N"  && !is.na(window_start_row$Window)) {
          # C, S or P: the window name is also the name of the summary column
          vs_time_summary_row[,window_start_row$Window] <- vs_time_summary_row[,window_start_row$Window] + elapsed_window_time 

          # Summarizing class time for given test: the time is attributed to the test whose
          # page was open in the portal when the window was activated
          if(!is.na(window_start_row$Page) && window_start_row$Page != "H") {
            if(window_start_row$Window == "C") {
              cut_time_for_test_row[,as.numeric(window_start_row$Page)+2] <- cut_time_for_test_row[,as.numeric(window_start_row$Page)+2] + elapsed_window_time
            }
            if(window_start_row$Window == "S") {
              sut_time_for_test_row[,as.numeric(window_start_row$Page)+2] <- sut_time_for_test_row[,as.numeric(window_start_row$Page)+2] + elapsed_window_time
            }
            if(window_start_row$Window == "P") {
              put_time_for_test_row[,as.numeric(window_start_row$Page)+2] <- put_time_for_test_row[,as.numeric(window_start_row$Page)+2] + elapsed_window_time
            }
          }
        }
      }
      window_start_index <- NA
    } else if(row$Behavior == "VS activated") {
      currently_active <- "VS"
      # Start VS timer
      vs_start_index <- i
      window_start_index <- i
      # stopping Portal timers
      if(page_start_index != -1) {
        page_start_row <- extended_video_annotations[extended_video_annotations$PID == pid,][page_start_index,]
        elapsed_page_time <- row$Time - page_start_row$Time
        # Time spent on the home page (H) is not recorded
        if(page_start_row$Page != "H" && !is.na(page_start_row$Page)) {
            # e.g., TC0 -> 2. column index
            portal_time_summary_row[,as.numeric(page_start_row$Page)+2] <- portal_time_summary_row[,as.numeric(page_start_row$Page)+2] + elapsed_page_time 
        }
      }
      page_start_index <- NA
    } else if(row$Behavior == "Changed page in portal") {
      # The new page is given by the modifier; the time of the previous page is closed first
      portal_page <- row$Modifier.1
      if(page_start_index != -1) {
        page_start_row <- extended_video_annotations[extended_video_annotations$PID == pid,][page_start_index,]
        elapsed_page_time <- row$Time - page_start_row$Time
        if(page_start_row$Page != "H" && !is.na(page_start_row$Page)) {
            # e.g., TC0 -> 2. column index
            portal_time_summary_row[,as.numeric(page_start_row$Page)+2] <- portal_time_summary_row[,as.numeric(page_start_row$Page)+2] + elapsed_page_time 
        }
      }
      page_start_index <- i
    } else if(row$Behavior == "Changed window in VS") {
      # The new window is given by the modifier; the time of the previous window is closed first
      vs_window <- row$Modifier.1
      if(window_start_index != -1) {
        window_start_row <- extended_video_annotations[extended_video_annotations$PID == pid,][window_start_index,]
        elapsed_window_time <- row$Time - window_start_row$Time
        # NOTE(review): unlike the "Portal activated" branch, the per-test C/S/P times
        # (cut/sut/put_time_for_test_row) are not updated here - confirm that this is intended.
        if(window_start_row$Window != "C" && window_start_row$Window != "S" && window_start_row$Window != "N" && window_start_row$Window != "P" && !is.na(window_start_row$Window)) {
          # e.g., TC0 -> 2. column index
          vs_time_summary_row[,as.numeric(window_start_row$Window)+2] <- vs_time_summary_row[,as.numeric(window_start_row$Window)+2] + elapsed_window_time 
        } else if(window_start_row$Window != "N" && !is.na(window_start_row$Window)) {
          vs_time_summary_row[,window_start_row$Window] <- vs_time_summary_row[,window_start_row$Window] + elapsed_window_time 
        }
      }
      window_start_index <- i
    } else if(row$Behavior == "Marked as OK") {
      # The mark is counted for the test whose page is currently open in the portal
      marked_case <- portal_page
      marked_ok_summary_row[,as.numeric(marked_case)+2] <- marked_ok_summary_row[,as.numeric(marked_case)+2] +1
    } else if(row$Behavior == "Marked as WRONG") {
      marked_case <- portal_page
      marked_wrong_summary_row[,as.numeric(marked_case)+2] <- marked_wrong_summary_row[,as.numeric(marked_case)+2] + 1
    } else if(row$Behavior == "Running test") {
      # The executed test is taken to be the window that is currently open in VS
      run_case <- vs_window 
    } else if(row$Behavior == "Submit") {
      # Nothing to summarize; the annotation is only copied to the extended annotations
    } else if(row$Behavior == "Other event") {
      # Nothing to summarize; the annotation is only copied to the extended annotations
    } else if(row$Behavior == "Debug test") {
      if(row$Modifier.1 == "S") {
        # Modifier S: remember where the debug session started and which window was open
        debug_start_index <- i
        debug_case <- vs_window
      } else if(row$Modifier.1 == "E") {
        # Modifier E: the debugged case is read back from the start annotation and the
        # length of the session is the time elapsed since then
        debug_case <- extended_video_annotations[extended_video_annotations$PID == pid,][debug_start_index,]$DebugCase
        debug_time <- row$Time - annotations_for_participant[annotations_for_participant$pid == pid,][debug_start_index,]$Time
        debug_start_index <- NA
      }
    } else if(row$Behavior == "Remove answer") {
      # marked_case is not stored in the extended row, so this assignment has no further effect
      marked_case <- row$Modifier.1
    } else if(row$Behavior == "Missing test problem") {
      if(row$Modifier.1 == "S") {
        # Modifier S: the running window timer is closed.
        # NOTE(review): there is no `window_start_index != -1` guard in this branch.
        window_start_row <- extended_video_annotations[extended_video_annotations$PID == pid,][window_start_index,]
        elapsed_window_time <- row$Time - window_start_row$Time
        if(window_start_row$Window != "C" && window_start_row$Window != "S" && window_start_row$Window != "N" && window_start_row$Window != "P" && !is.na(window_start_row$Window)) {
          # e.g., TC0 -> 2. column index
          vs_time_summary_row[,as.numeric(window_start_row$Window)+2] <- vs_time_summary_row[,as.numeric(window_start_row$Window)+2] + elapsed_window_time 
        } else if(window_start_row$Window != "N" && !is.na(window_start_row$Window)) {
          vs_time_summary_row[,window_start_row$Window] <- vs_time_summary_row[,window_start_row$Window] + elapsed_window_time 
        }
      } else if(row$Modifier.1 == "E") {
        # Modifier E: the window timer is restarted from this annotation
        window_start_index <- i
      }
    }
    # Store the annotation together with the state after it has been processed
    extended_row <- data.frame(PID=pid,Timestamp=row$Time,Behavior=row$Behavior,Modifier=row$Modifier.1,Active=currently_active,Page=portal_page,Window=vs_window,RunCase=run_case,DebugLength=debug_time,DebugCase=debug_case,stringsAsFactors = FALSE)
    extended_video_annotations <- rbind(extended_video_annotations,extended_row)
  }
  # Append the summary rows of this participant to the result tables
  portal_time_summary <- rbind(portal_time_summary, portal_time_summary_row)
  vs_time_summary <- rbind(vs_time_summary, vs_time_summary_row)
  marked_ok_summary <- rbind(marked_ok_summary, marked_ok_summary_row)
  marked_wrong_summary <- rbind(marked_wrong_summary, marked_wrong_summary_row)
  cut_time_for_test <- rbind(cut_time_for_test, cut_time_for_test_row)
  sut_time_for_test <- rbind(sut_time_for_test, sut_time_for_test_row)
  put_time_for_test <- rbind(put_time_for_test, put_time_for_test_row)
}
# Control sums per participant: the sum of all per-test columns (T0.1 ... T14.5)
# and, for VS, additionally the S, C and P windows.
portal_time_summary$check_sums <- rowSums(
  portal_time_summary[, paste0("T", 0:14, ".", rep(1:5, each = 3))]
)
vs_time_summary$check_sums <- rowSums(
  vs_time_summary[, c(paste0("T", 0:14, ".", rep(1:5, each = 3)), "S", "C", "P")]
)

# Extend the summaries (ordered by PID) with the project of each participant:
# participants with a PID above 40 worked on MathNet, the others on NBitcoin.
# The project is derived from the PID of the already sorted rows, so the label
# always belongs to the row it is attached to, regardless of the order in
# which the participants were processed.
vs_time_summary_with_projects <- vs_time_summary[order(vs_time_summary$PID), ]
vs_time_summary_with_projects$Project <- ifelse(
  as.numeric(vs_time_summary_with_projects$PID) > 40, "MathNet", "NBitcoin"
)

portal_time_summary_with_projects <- portal_time_summary[order(portal_time_summary$PID), ]
portal_time_summary_with_projects$Project <- ifelse(
  as.numeric(portal_time_summary_with_projects$PID) > 40, "MathNet", "NBitcoin"
)

# Tests individually: long format with one row per participant and test
time_spent_tests_vs <- melt(
  vs_time_summary_with_projects,
  id.vars = c("PID", "Project"),
  measure.vars = c("T0.1", "T1.1", "T2.1", "T3.2", "T4.2", "T5.2", "T6.3", "T7.3",
                   "T8.3", "T9.4", "T10.4", "T11.4", "T12.5", "T13.5", "T14.5"),
  variable.name = "Window",
  value.name = "Time"
)
time_spent_tests_vs <- time_spent_tests_vs[with(time_spent_tests_vs, order(PID, Window)), ]

time_spent_tests_portal <- melt(
  portal_time_summary_with_projects,
  id.vars = c("PID", "Project"),
  measure.vars = c("T0.1", "T1.1", "T2.1", "T3.2", "T4.2", "T5.2", "T6.3", "T7.3",
                   "T8.3", "T9.4", "T10.4", "T11.4", "T12.5", "T13.5", "T14.5"),
  variable.name = "Page",
  value.name = "Time"
)
time_spent_tests_portal <- time_spent_tests_portal[with(time_spent_tests_portal, order(PID, Page)), ]

# Total time per test = time in VS + time in the portal. Both tables are ordered
# by participant and test, so the rows are added position by position.
time_spent_tests <- time_spent_tests_vs
time_spent_tests$Time <- time_spent_tests$Time + time_spent_tests_portal$Time

# Time spent in the S, C and P windows of VS
time_spent_others <- melt(
  vs_time_summary_with_projects,
  id.vars = c("PID", "Project"),
  measure.vars = c("S", "C", "P"),
  variable.name = "Window",
  value.name = "Time"
)

# Per-project subsets with the category of each test;
# NBitcoin: T5.2, T7.3 and T10.4 are the faulty tests
time_spent_tests_nbitcoin <- time_spent_tests[time_spent_tests$Project == "NBitcoin", ]
time_spent_tests_nbitcoin$TestCategory <- ifelse(
  time_spent_tests_nbitcoin$Window == "T5.2" |
    time_spent_tests_nbitcoin$Window == "T7.3" |
    time_spent_tests_nbitcoin$Window == "T10.4",
  "Faulty",
  "Not faulty"
)
# MathNet: T2.1, T4.2 and T11.4 are the faulty tests
time_spent_tests_mathnet <- time_spent_tests[time_spent_tests$Project == "MathNet", ]
time_spent_tests_mathnet$TestCategory <- ifelse(
  time_spent_tests_mathnet$Window == "T2.1" |
    time_spent_tests_mathnet$Window == "T4.2" |
    time_spent_tests_mathnet$Window == "T11.4",
  "Faulty",
  "Not faulty"
)

# Tests grouped by methods: every method (M1 ... M5) has three tests, and the
# time of a method is the portal time plus the VS time of its tests.
# Both summaries are ordered by PID so that their rows refer to the same participant.
vs_time_summary_with_projects <- vs_time_summary_with_projects[order(vs_time_summary_with_projects$PID), ]
portal_time_summary_with_projects <- portal_time_summary_with_projects[order(portal_time_summary_with_projects$PID), ]
pts <- portal_time_summary_with_projects
vts <- vs_time_summary_with_projects

# The data frame is built directly from the computed columns, so it has one row
# per participant without relying on a fixed number of participants.
time_spent_tests_grouped <- data.frame(
  M1 = (pts$T0.1 + pts$T1.1 + pts$T2.1) + (vts$T0.1 + vts$T1.1 + vts$T2.1),
  M2 = (pts$T3.2 + pts$T4.2 + pts$T5.2) + (vts$T3.2 + vts$T4.2 + vts$T5.2),
  M3 = (pts$T6.3 + pts$T7.3 + pts$T8.3) + (vts$T6.3 + vts$T7.3 + vts$T8.3),
  M4 = (pts$T9.4 + pts$T10.4 + pts$T11.4) + (vts$T9.4 + vts$T10.4 + vts$T11.4),
  M5 = (pts$T12.5 + pts$T13.5 + pts$T14.5) + (vts$T12.5 + vts$T13.5 + vts$T14.5)
)

# Other windows - SUT, CUT and PUT
time_spent_others_nbitcoin <- time_spent_others[time_spent_others$Project == "NBitcoin", ]
time_spent_others_mathnet <- time_spent_others[time_spent_others$Project == "MathNet", ]

# Joining the dataset: total test time per participant in the portal and in VS.
# The location label is added with mutate(), so it is recycled to the actual
# number of participants instead of depending on a fixed count.
summarized_test_times_portal <- time_spent_tests_portal %>%
  group_by(PID, Project) %>%
  summarize(Time = sum(Time)) %>%
  mutate(Location = "Portal")
summarized_test_time_vs <- time_spent_tests_vs %>%
  group_by(PID, Project) %>%
  summarize(Time = sum(Time)) %>%
  mutate(Location = "VS")
# The third column of time_spent_others holds the window (S, C or P); it is
# renamed so that it lines up with the Location column of the summaries.
names(time_spent_others)[3] <- "Location"

full_summarized_times <- summarized_test_times_portal %>%
  bind_rows(summarized_test_time_vs) %>%
  bind_rows(time_spent_others)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# Location as an ordered set of categories with readable labels:
# VS -> "Test code", C -> "CUT", S -> "SUT", P -> "PUT"
full_summarized_times$Location <- factor(
  full_summarized_times$Location,
  levels = c("Portal", "VS", "C", "S", "P"),
  labels = c("Portal", "Test code", "CUT", "SUT", "PUT")
)

# Remove the temporary objects created while replaying the video annotations
rm(list = c(
  "cut_time_for_test_row", "extended_row", "marked_ok_summary_row",
  "marked_wrong_summary_row", "page_start_row", "portal_time_summary_row",
  "put_time_for_test_row", "row", "sut_time_for_test_row", "vs_time_summary_row",
  "window_start_row", "currently_active", "debug_case", "debug_start_index",
  "debug_time", "elapsed_page_time", "elapsed_vs_time", "elapsed_window_time",
  "i", "marked_case", "page_start_index", "pid", "portal_page", "run_case",
  "vs_start_index", "vs_window", "window_start_index"
))

Extracting replication study activities from activity logs

The participant activities were successfully recorded in the replication study; thus, we extract them from the log files.

# Perform the same analytics as in the original study

# Summarizes, per participant, the time spent at each location based on the
# activity log of the replication study.
#
# Arguments:
#   log                   - activity log; the columns PID, Date, Action and
#                           Location are read. Date is parsed with the format
#                           "%m/%d/%Y %I:%M:%S %p".
#   start_time            - only entries with a Date later than this are used.
#   testFile_test_mapping - function that maps the Location of an
#                           "ActivateWindow" entry to the active VS window. The
#                           result is used as a column name of the VS summary
#                           ("T0.1" ... "T14.5", "SUT", "CUT", "PUT"); "" means
#                           that the time is not recorded.
#
# The portal page of a "Code" entry is resolved with testSeq_to_seq(), which is
# defined outside of this function.
#
# Returns an unnamed list of six tables with one row per participant:
#   [[1]] time per VS window, [[2]] time per portal page,
#   [[3]], [[4]], [[5]] time in the SUT / CUT / PUT window split by the portal
#   page that was open at that time, [[6]] the length of the whole session.
# The times in [[1]] - [[5]] are in seconds.
summarize_logs <- function(log, start_time, testFile_test_mapping) {
  time_format <- "%m/%d/%Y %I:%M:%S %p"
  test_columns <- paste0("T", 0:14, ".", rep(1:5, each = 3))
  portal_actions <- c("BrowserFocused", "Index", "Code", "Marked", "Submit")

  # One-row data frame with the PID and a zero for each of the given columns
  zero_row <- function(pid, columns) {
    data.frame(
      PID = pid,
      as.list(setNames(numeric(length(columns)), columns)),
      stringsAsFactors = FALSE
    )
  }

  work_length <- tibble()
  vs_time_summary_rep <- tibble()
  portal_time_summary_rep <- tibble()
  sut_time_summary_rep <- tibble()
  cut_time_summary_rep <- tibble()
  put_time_summary_rep <- tibble()

  series <- log %>% filter(strptime(Date, format = time_format) > start_time)

  for (pid in unique(series$PID)) {
    # All lookups below are relative to the activities of this participant, so
    # the result does not depend on how the participants are ordered or
    # interleaved in the log.
    pid_activities <- series %>% filter(PID == pid)
    n_activities <- nrow(pid_activities)
    actions <- as.character(pid_activities$Action)
    event_times <- strptime(pid_activities$Date, format = time_format)

    # NOTE(review): the unit of this difftime is selected automatically
    # (secs/mins/hours) and can therefore differ between participants - confirm
    # that the statistics computed from it take this into account.
    pid_work_length <- tibble(PID = pid, Time = event_times[n_activities] - event_times[1])
    work_length <- work_length %>% bind_rows(pid_work_length)

    # Replaying the log: for every activity, determine the active application
    # (VS or Portal), the active VS window and the active portal page.
    active <- if (actions[1] == "ActivateWindow") "VS" else "Portal"
    vs_active <- ""
    p_active <- "Index"
    active_at <- character(n_activities)
    vs_active_at <- character(n_activities)
    p_active_at <- character(n_activities)
    for (k in seq_len(n_activities)) {
      action <- actions[k]
      if (action == "ActivateWindow" || action == "BrowserFocusLost") {
        active <- "VS"
      } else if (action %in% portal_actions) {
        active <- "Portal"
      }
      if (action == "ActivateWindow") {
        vs_active <- as.character(testFile_test_mapping(pid_activities$Location[k]))
      }
      if (action == "Index") {
        p_active <- "Index"
      } else if (action == "Code") {
        p_active <- as.character(testSeq_to_seq(pid_activities$Location[k]))
      }
      active_at[k] <- active
      vs_active_at[k] <- vs_active
      p_active_at[k] <- p_active
    }

    # Summarizing each location time
    vs_time_summary_row <- zero_row(pid, c(test_columns, "SUT", "CUT", "PUT", "Total"))
    portal_time_summary_row <- zero_row(pid, c("Index", test_columns))
    sut_time_summary_row <- zero_row(pid, c("Index", test_columns))
    cut_time_summary_row <- zero_row(pid, c("Index", test_columns))
    put_time_summary_row <- zero_row(pid, c("Index", test_columns))

    last_change_time <- event_times[1] # The last change time is the first at start
    for (k in seq_len(n_activities)[-1]) {
      was_portal <- active_at[k - 1] == "Portal"
      # A change is a switch between the portal and VS, a different portal page
      # while in the portal, or a different window while in VS.
      location_changed <- active_at[k - 1] != active_at[k] ||
        (was_portal && p_active_at[k - 1] != p_active_at[k]) ||
        (!was_portal && vs_active_at[k - 1] != vs_active_at[k])
      if (!location_changed) {
        next
      }

      # The unit is requested explicitly: subtracting two date-times yields a
      # difftime whose unit depends on its magnitude, so its plain numeric
      # value would mix seconds, minutes and hours.
      elapsed <- as.numeric(difftime(event_times[k], last_change_time, units = "secs"))
      page <- p_active_at[k - 1]
      window <- vs_active_at[k - 1]

      if (was_portal) {
        portal_time_summary_row[page] <- portal_time_summary_row[page] + elapsed
      } else if (window != "") {
        # Time in the SUT/PUT/CUT windows is additionally attributed to the
        # portal page that was open at that time.
        if (window == "SUT") {
          sut_time_summary_row[page] <- sut_time_summary_row[page] + elapsed
        } else if (window == "PUT") {
          put_time_summary_row[page] <- put_time_summary_row[page] + elapsed
        } else if (window == "CUT") {
          cut_time_summary_row[page] <- cut_time_summary_row[page] + elapsed
        }
        vs_time_summary_row[window] <- vs_time_summary_row[window] + elapsed
      }
      last_change_time <- event_times[k]
    }

    vs_time_summary_rep <- vs_time_summary_rep %>% bind_rows(vs_time_summary_row)
    portal_time_summary_rep <- portal_time_summary_rep %>% bind_rows(portal_time_summary_row)
    sut_time_summary_rep <- sut_time_summary_rep %>% bind_rows(sut_time_summary_row)
    cut_time_summary_rep <- cut_time_summary_rep %>% bind_rows(cut_time_summary_row)
    put_time_summary_rep <- put_time_summary_rep %>% bind_rows(put_time_summary_row)
  }

  # Totals: the "Index" page is not part of the totals
  vs_time_summary_rep <- vs_time_summary_rep %>% mutate(Total = T0.1+T1.1+T2.1+T3.2+T4.2+T5.2+T6.3+T7.3+T8.3+T9.4+T10.4+T11.4+T12.5+T13.5+T14.5+CUT+PUT+SUT)
  portal_time_summary_rep <- portal_time_summary_rep %>% mutate(Total = T0.1+T1.1+T2.1+T3.2+T4.2+T5.2+T6.3+T7.3+T8.3+T9.4+T10.4+T11.4+T12.5+T13.5+T14.5)
  sut_time_summary_rep <- sut_time_summary_rep %>% mutate(Total = T0.1+T1.1+T2.1+T3.2+T4.2+T5.2+T6.3+T7.3+T8.3+T9.4+T10.4+T11.4+T12.5+T13.5+T14.5)
  cut_time_summary_rep <- cut_time_summary_rep %>% mutate(Total = T0.1+T1.1+T2.1+T3.2+T4.2+T5.2+T6.3+T7.3+T8.3+T9.4+T10.4+T11.4+T12.5+T13.5+T14.5)
  put_time_summary_rep <- put_time_summary_rep %>% mutate(Total = T0.1+T1.1+T2.1+T3.2+T4.2+T5.2+T6.3+T7.3+T8.3+T9.4+T10.4+T11.4+T12.5+T13.5+T14.5)

  return(list(vs_time_summary_rep, portal_time_summary_rep, sut_time_summary_rep, cut_time_summary_rep, put_time_summary_rep, work_length))
}

# Summaries of the NodaTime sessions; the list elements are, in order: VS,
# portal, SUT, CUT and PUT times and the work length. Each table is labelled
# with its project.
output_nodatime <- summarize_logs(nodatime_log, nodatime_start_time, nodatime_test_to_seq)
nodatime_vs <- mutate(output_nodatime[[1]], Project = "NodaTime")
nodatime_portal <- mutate(output_nodatime[[2]], Project = "NodaTime")
nodatime_sut_tests <- mutate(output_nodatime[[3]], Project = "NodaTime")
nodatime_cut_tests <- mutate(output_nodatime[[4]], Project = "NodaTime")
nodatime_put_tests <- mutate(output_nodatime[[5]], Project = "NodaTime")
nodatime_work_length <- mutate(output_nodatime[[6]], Project = "NodaTime")

# The same for the NetTopology sessions
output_nettopology <- summarize_logs(nettopology_log, nettopology_start_time, nettopology_test_to_seq)
nettopology_vs <- mutate(output_nettopology[[1]], Project = "NetTopology")
nettopology_portal <- mutate(output_nettopology[[2]], Project = "NetTopology")
nettopology_sut_tests <- mutate(output_nettopology[[3]], Project = "NetTopology")
nettopology_cut_tests <- mutate(output_nettopology[[4]], Project = "NetTopology")
nettopology_put_tests <- mutate(output_nettopology[[5]], Project = "NetTopology")
nettopology_work_length <- mutate(output_nettopology[[6]], Project = "NetTopology")

# Calculating work length statistics (one row per project)
nodatime_work_stats <- summarize(
  nodatime_work_length,
  mean = mean(Time), median = median(Time), sd = sd(Time), min = min(Time), max = max(Time)
)
nettopology_work_stats <- summarize(
  nettopology_work_length,
  mean = mean(Time), median = median(Time), sd = sd(Time), min = min(Time), max = max(Time)
)

# Binding the two projects: NodaTime rows first, NetTopology rows after them
rep_portal <- bind_rows(nodatime_portal, nettopology_portal)
rep_vs <- bind_rows(nodatime_vs, nettopology_vs)
rep_sut_tests <- bind_rows(nodatime_sut_tests, nettopology_sut_tests)
rep_cut_tests <- bind_rows(nodatime_cut_tests, nettopology_cut_tests)
rep_put_tests <- bind_rows(nodatime_put_tests, nettopology_put_tests)

# Melting: long format with one row per participant and test (or window)
rep_portal_time <- melt(
  rep_portal,
  id.vars = c("PID", "Project"),
  measure.vars = c("T0.1", "T1.1", "T2.1", "T3.2", "T4.2", "T5.2", "T6.3", "T7.3",
                   "T8.3", "T9.4", "T10.4", "T11.4", "T12.5", "T13.5", "T14.5"),
  variable.name = "Page",
  value.name = "Time"
)
rep_vs_time <- melt(
  rep_vs,
  id.vars = c("PID", "Project"),
  measure.vars = c("T0.1", "T1.1", "T2.1", "T3.2", "T4.2", "T5.2", "T6.3", "T7.3",
                   "T8.3", "T9.4", "T10.4", "T11.4", "T12.5", "T13.5", "T14.5"),
  variable.name = "Window",
  value.name = "Time"
)
# Time spent in the SUT, CUT and PUT windows of VS
rep_others_time <- melt(
  rep_vs,
  id.vars = c("PID", "Project"),
  measure.vars = c("SUT", "CUT", "PUT"),
  variable.name = "Location",
  value.name = "Time"
)


# Mark faulty tests: T2.1, T6.3 and T12.5 for NodaTime; T0.1, T4.2 and T10.4
# for NetTopology. The flag is computed for all rows at once, so the step also
# works when one of the projects has no rows.
rep_portal_time <- rep_portal_time %>%
  mutate(
    Faulty = (Project == "NodaTime" & Page %in% c("T2.1", "T6.3", "T12.5")) |
      (Project == "NetTopology" & Page %in% c("T0.1", "T4.2", "T10.4"))
  )

rep_vs_time <- rep_vs_time %>%
  mutate(
    Faulty = (Project == "NodaTime" & Window %in% c("T2.1", "T6.3", "T12.5")) |
      (Project == "NetTopology" & Window %in% c("T0.1", "T4.2", "T10.4"))
  )

# Joining the dataset
# Total test time per participant in the portal and in VS. The location label
# is added with mutate(), so it is recycled to the actual number of
# participants instead of depending on a fixed count.
summarized_rep_portal <- rep_portal_time %>%
  group_by(PID, Project) %>%
  summarize(Time = sum(Time)) %>%
  mutate(Location = "Portal")
summarized_rep_vs <- rep_vs_time %>%
  group_by(PID, Project) %>%
  summarize(Time = sum(Time)) %>%
  mutate(Location = "VS")


full_summarized_times_rep <- summarized_rep_portal %>%
  bind_rows(summarized_rep_vs) %>%
  bind_rows(rep_others_time)
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# Location as an ordered set of categories; only "VS" is relabelled (to "Test code")
full_summarized_times_rep$Location <- factor(full_summarized_times_rep$Location, levels = c("Portal", "VS", "CUT", "SUT", "PUT"))
full_summarized_times_rep$Location <- plyr::revalue(full_summarized_times_rep$Location, c("VS" = "Test code"))

Visualizing full time spent

The following boxplots visualize the time spent by participants at each possible location.

Original study

# IN PAPER
#pdf(file="full-time-spent-nbitcoin.pdf",width=4,height = 2.5)
# Time spent at each location by the NBitcoin participants; the y axis is limited to 0-1600
ggplot(
  data = filter(full_summarized_times, Project == "NBitcoin"),
  mapping = aes(x = Location, y = Time)
) +
  geom_boxplot() +
  theme_hc() +
  scale_y_continuous(limits = c(0, 1600), breaks = seq(0, 1500, 250)) +
  ylab("Time [s]")
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).

#dev.off()

# IN PAPER
#pdf(file="full-time-spent-mathnet.pdf",width=4,height = 2.5)
# Time spent at each location by the MathNet participants; the y axis is limited to 0-1600
ggplot(
  data = filter(full_summarized_times, Project == "MathNet"),
  mapping = aes(x = Location, y = Time)
) +
  geom_boxplot() +
  theme_hc() +
  scale_y_continuous(limits = c(0, 1600), breaks = seq(0, 1500, 250)) +
  ylab("Time [s]")

#dev.off()

Replication study

# IN PAPER
#pdf(file="C:\\PhD\\Repos\\paper-wbstudy-ist\\figures\\full-time-spent-nodatime.pdf",width=4,height = 2.5)
# Time spent at each location by the NodaTime participants; the y axis is limited to 0-1600
ggplot(
  data = filter(full_summarized_times_rep, Project == "NodaTime"),
  mapping = aes(x = Location, y = Time)
) +
  geom_boxplot() +
  theme_hc() +
  scale_y_continuous(limits = c(0, 1600), breaks = seq(0, 1500, 250)) +
  ylab("Time [s]")

#dev.off()

# IN PAPER
#pdf(file="C:\\PhD\\Repos\\paper-wbstudy-ist\\figures\\full-time-spent-nettopology.pdf",width=4,height = 2.5)
# The same plot for the NetTopology participants
ggplot(
  data = filter(full_summarized_times_rep, Project == "NetTopology"),
  mapping = aes(x = Location, y = Time)
) +
  geom_boxplot() +
  theme_hc() +
  scale_y_continuous(limits = c(0, 1600), breaks = seq(0, 1500, 250)) +
  ylab("Time [s]")

#dev.off()

Full time spent on each test

The following boxplots (two per study) show the total time invested in each test case, computed by summing the time spent at each possible location.


# Builds the per-test time summary used by the "time spent on each test" plots.
#
# The CUT, PUT and SUT timing tables are reshaped from wide (one column per
# test) to long form, stacked with the VS and portal timings, joined with the
# participant answers, and summed per participant and test.
#
# Arguments:
#   first_project        Project name assigned to rows with PID < 41.
#   second_project       Project name assigned to rows with PID > 40.
#   first_faulty_tests   Test IDs flagged as faulty for first_project.
#   second_faulty_tests  Test IDs flagged as faulty for second_project.
#   p_cut_tests, p_put_tests, p_sut_tests
#                        Wide tables with a PID column and one column per
#                        test ID (T0.1 ... T14.5).
#   p_vs_time            Table with PID, Window and Time columns; Window is
#                        used as the test ID.
#   p_portal_time        Table with PID, Page and Time columns; Page is used
#                        as the test ID.
#   p_result_answers     Answers with pid, TestId, Project and Method columns.
#
# Returns the summarised table with columns PID, Test, Project, Method, Time
# and Faulty ("YES"/"NO"); Test is a factor ordered by test ID.
time_summary_data <- function(first_project,
                              second_project,
                              first_faulty_tests,
                              second_faulty_tests,
                              p_cut_tests,
                              p_put_tests,
                              p_sut_tests,
                              p_vs_time,
                              p_portal_time,
                              p_result_answers) {
  # Test identifiers, in the order they are shown on the plots.
  test_ids <- c("T0.1", "T1.1", "T2.1", "T3.2", "T4.2", "T5.2", "T6.3", "T7.3",
                "T8.3", "T9.4", "T10.4", "T11.4", "T12.5", "T13.5", "T14.5")

  # Wide per-test tables -> long (PID, Test, Time) form.
  cut_times <- melt(p_cut_tests, id.vars = c("PID"), measure.vars = test_ids,
                    variable.name = "Test", value.name = "Time")
  put_times <- melt(p_put_tests, id.vars = c("PID"), measure.vars = test_ids,
                    variable.name = "Test", value.name = "Time")
  sut_times <- melt(p_sut_tests, id.vars = c("PID"), measure.vars = test_ids,
                    variable.name = "Test", value.name = "Time")

  # VS and portal timings already are in long form; only align column names.
  vs_time_for_tests <- subset(p_vs_time, select = c(PID, Window, Time))
  names(vs_time_for_tests) <- c("PID", "Test", "Time")

  portal_time_for_tests <- subset(p_portal_time, select = c(PID, Page, Time))
  names(portal_time_for_tests) <- c("PID", "Test", "Time")

  # Stack everything; the order here must match the Location labels below.
  all_times_for_tests <- rbind(cut_times, put_times)
  all_times_for_tests <- rbind(all_times_for_tests, sut_times)
  all_times_for_tests <- rbind(all_times_for_tests, vs_time_for_tests)
  all_times_for_tests <- rbind(all_times_for_tests, portal_time_for_tests)

  all_times_for_tests$Location <- c(
    rep("CUT", nrow(cut_times)),
    rep("PUT", nrow(put_times)),
    rep("SUT", nrow(sut_times)),
    rep("VS", nrow(vs_time_for_tests)),
    rep("Portal", nrow(portal_time_for_tests))
  )

  # NOTE(review): if PID arrives as a factor, as.numeric() yields level codes
  # rather than the printed identifiers - confirm PID is numeric or character.
  all_times_for_tests$PID <- as.numeric(all_times_for_tests$PID)

  # The project is derived from the participant identifier.
  all_times_for_tests$Project <- rep(NA, nrow(all_times_for_tests))
  all_times_for_tests$Project[all_times_for_tests$PID > 40] <- second_project
  all_times_for_tests$Project[all_times_for_tests$PID < 41] <- first_project

  # Original study only: drop tests T0.1 and T1.1 of participants 55 and 59.
  # NOTE(review): presumably invalid measurements - confirm the reason.
  if (first_project == "NBitcoin" && second_project == "MathNet") {
    all_times_for_tests <- dplyr::filter(all_times_for_tests, !((PID == 59 & (Test == "T0.1" | Test == "T1.1")) | (PID == 55 & (Test == "T0.1" | Test == "T1.1"))))
  }

  jj <- inner_join(x = all_times_for_tests, y = p_result_answers, by = c("PID" = "pid", "Test" = "TestId", "Project" = "Project"))

  # Sum the time over all locations per participant and test, then flag the
  # faulty tests. The condition uses the element-wise `&` / `|`: Project and
  # Test are column vectors here, and the scalar `&&` / `||` only look at a
  # single element (and raise an error on longer input since R 4.3).
  time_summary_plot <- jj %>%
    group_by(PID, Test, Project, Method) %>%
    summarise(Time = sum(Time)) %>%
    mutate(Faulty = ifelse(
      (Project == first_project & Test %in% first_faulty_tests) |
        (Project == second_project & Test %in% second_faulty_tests),
      "YES",
      "NO"
    ))

  time_summary_plot$Test <- factor(time_summary_plot$Test, levels = test_ids)

  return(time_summary_plot)
}

# Fill colours handed to scale_fill_manual() in the per-test boxplots below:
# white first, orange second.
palette <- c(
  "#FFFFFF",
  "#ff821c"
)

Original study

# Per-test time summary of the original study (NBitcoin and MathNet).
time_summary_plot <- time_summary_data(
  first_project = "NBitcoin",
  second_project = "MathNet",
  first_faulty_tests = c("T5.2", "T7.3", "T10.4"),
  second_faulty_tests = c("T2.1", "T4.2", "T11.4"),
  p_cut_tests = cut_time_for_test,
  p_put_tests = put_time_for_test,
  p_sut_tests = sut_time_for_test,
  p_vs_time = time_spent_tests_vs,
  p_portal_time = time_spent_tests_portal,
  p_result_answers = result_answers
)
## Warning: Column `Test`/`TestId` joining factors with different levels,
## coercing to character vector

# IN PAPER
#pdf(file="mathnet-time-spent-tests.pdf",width=7,height = 3.5)
# Time per test for the MathNet rows, one facet per Method; the box fill
# encodes the Faulty flag and the fill legend is suppressed.
time_summary_plot %>%
  filter(Project == "MathNet") %>%
  ggplot(aes(x = Test, y = Time, fill = factor(Faulty))) +
  geom_boxplot() +
  facet_grid(~Method, scales = "free_x") +
  theme_hc() +
  ylab("Time [s]") +
  xlab("Test ID") +
  scale_fill_manual(values = palette, guide = FALSE) +
  scale_y_continuous(limits = c(0, 900), breaks = seq(0, 900, 100))

#dev.off()

# IN PAPER
#pdf(file="nbitcoin-time-spent-tests.pdf",width=7,height = 3.5)
# Time per test for the NBitcoin rows, one facet per Method; the box fill
# encodes the Faulty flag and the fill legend is suppressed.
time_summary_plot %>%
  filter(Project == "NBitcoin") %>%
  ggplot(aes(x = Test, y = Time, fill = factor(Faulty))) +
  geom_boxplot() +
  facet_grid(~Method, scales = "free_x") +
  theme_hc() +
  ylab("Time [s]") +
  xlab("Test ID") +
  scale_fill_manual(values = palette, guide = FALSE) +
  scale_y_continuous(limits = c(0, 900), breaks = seq(0, 900, 100))

#dev.off()

Replication study

# Per-test time summary of the replication (NodaTime and NetTopologySuite).
time_summary_plot_rep <- time_summary_data(
  first_project = "NodaTime",
  second_project = "NetTopologySuite",
  first_faulty_tests = c("T2.1", "T6.3", "T12.5"),
  second_faulty_tests = c("T0.1", "T4.2", "T10.4"),
  p_cut_tests = rep_cut_tests,
  p_put_tests = rep_put_tests,
  p_sut_tests = rep_sut_tests,
  p_vs_time = rep_vs_time,
  p_portal_time = rep_portal_time,
  p_result_answers = r_result_answers
)
## Warning: Column `Test`/`TestId` joining factors with different levels,
## coercing to character vector
# IN PAPER
#pdf(file="C:\\PhD\\Repos\\paper-wbstudy-ist\\figures\\nettopology-time-spent-tests.pdf",width=7,height = 3.5)
# Time per test for the NetTopologySuite rows of the replication, one facet
# per Method; the box fill encodes the Faulty flag, without a fill legend.
time_summary_plot_rep %>%
  filter(Project == "NetTopologySuite") %>%
  ggplot(aes(x = Test, y = Time, fill = factor(Faulty))) +
  geom_boxplot() +
  facet_grid(~Method, scales = "free_x") +
  theme_hc() +
  ylab("Time [s]") +
  xlab("Test ID") +
  scale_fill_manual(values = palette, guide = FALSE) +
  scale_y_continuous(limits = c(0, 900), breaks = seq(0, 900, 100))

#dev.off()

# IN PAPER
#pdf(file="C:\\PhD\\Repos\\paper-wbstudy-ist\\figures\\nodatime-time-spent-tests.pdf",width=7,height = 3.5)
# Time per test for the NodaTime rows of the replication, one facet per
# Method; the box fill encodes the Faulty flag, without a fill legend.
time_summary_plot_rep %>%
  filter(Project == "NodaTime") %>%
  ggplot(aes(x = Test, y = Time, fill = factor(Faulty))) +
  geom_boxplot() +
  facet_grid(~Method, scales = "free_x") +
  theme_hc() +
  ylab("Time [s]") +
  xlab("Test ID") +
  scale_fill_manual(values = palette, guide = FALSE) +
  scale_y_continuous(limits = c(0, 900), breaks = seq(0, 900, 100))

#dev.off()