Sampling and randomizations for Llama 3.1 70B Instruct data analysis

Required Libraries to Run the Script

# Required Libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(sampling)
library(readxl)

Maximum Variation Sampling

Calculating the Margin of Error for Maximum Variation Sampling (256 requests-responses) at a 90% confidence level

Sample to be evaluated by the research team.

# Inputs for the margin-of-error calculation
n <- 256                  # number of items in the sample
confidence_level <- 0.90  # two-sided confidence level
p <- 0.5                  # assumed proportion; 0.5 maximizes p * (1 - p) (worst case)

# Two-sided critical value of the standard normal distribution
alpha_level <- 1 - confidence_level
z_score <- qnorm(1 - alpha_level / 2)

# Standard error of a proportion, scaled by the critical value
standard_error <- sqrt(p * (1 - p) / n)
margin_of_error <- z_score * standard_error

# Express the margin of error in percentage points
margin_of_error_percent <- margin_of_error * 100

# Report the result
cat(
  "The margin of error for a sample of", n,
  "items with a", confidence_level * 100,
  "% confidence interval is:", margin_of_error_percent, "\n"
)

## The margin of error for a sample of 256 items with a 90 % confidence interval is: 5.140168

Margin of Error in Stratified Random Sampling

Calculating the Margin of Error for Stratified Random Sampling (72 requests-responses) at a 90% confidence level

Sample to be evaluated by external evaluators.

# Design inputs
N <- 264    # population size
n <- 72     # sample size
p <- 0.5    # assumed proportion (0.5 gives the most conservative estimate)
Z <- 1.645  # critical value used for the 90% confidence interval

# Variance of a sample proportion, before any finite-population adjustment
sampling_variance <- p * (1 - p) / n

# Finite population correction: shrinks the variance because the sample
# is a sizeable fraction of the population
fpc <- (N - n) / (N - 1)

# Margin of error, as a proportion and as a percentage
ME <- Z * sqrt(sampling_variance * fpc)
ME_percentage <- ME * 100

# Report the margin of error as a percentage
cat("The margin of error is:", ME_percentage, "%")

## The margin of error is: 8.282137 %

Randomization for Uniform Random Sampling

Randomization and stratification of the sample to be evaluated by external evaluators

The sample consists of 72 request-response pairs out of a total of 256, divided into 3 groups.

# Draw a stratified random sample from the database, assign each sampled row
# to one of the external evaluators at random, and save the result as a CSV.

# Fix the random seed so the sample and the evaluator assignment are reproducible
set.seed(93485)

# Load the dataset from the location on your computer
ruta_archivo <- "C:/Users/Carlos/Desktop/Investigación IA/2-Base de datos/1-Database.xlsx"
datos <- read_xlsx(ruta_archivo)

# The stratification below relies on the "Tema" column; fail early if it is absent
if (!"Tema" %in% names(datos)) {
  stop("Column 'Tema' not found in the dataset: ", ruta_archivo, call. = FALSE)
}

# Total number of desired sample elements
total_elementos <- 72

# Get the number of unique strata
estratos <- unique(datos$Tema)
num_estratos <- length(estratos)

# Equal allocation needs a whole number of elements per stratum; a fractional
# (or undefined, when there are no strata) size must not reach strata()
if (num_estratos == 0 || total_elementos %% num_estratos != 0) {
  stop(
    "Cannot split ", total_elementos, " sample elements evenly across ",
    num_estratos, " strata.",
    call. = FALSE
  )
}

# Calculate the number of elements per stratum (the same size for every stratum)
n_por_estrato <- rep(total_elementos / num_estratos, num_estratos)

# Stratified sampling with equal allocation: simple random sampling without
# replacement ("srswor") inside each "Tema" stratum. Every entry of
# n_por_estrato is identical, so the order of the strata does not affect
# how many rows each one receives.
muestra <- strata(datos, stratanames = "Tema", size = n_por_estrato, method = "srswor")
datos_muestra <- getdata(datos, muestra)

# Assign random evaluators: build a balanced vector of labels
# ("Evaluador A", "Evaluador B", ...) and shuffle it across the sampled rows
num_evaluadores <- 3
nombres_evaluadores <- paste("Evaluador", LETTERS[seq_len(num_evaluadores)])
evaluadores <- rep(nombres_evaluadores, length.out = nrow(datos_muestra))
datos_muestra <- datos_muestra %>%
  mutate(evaluador = sample(evaluadores))

# Add an increment of 1 to the result in this column so that it matches the value in the original DB
datos_muestra <- datos_muestra %>%
  mutate(ID_unit = ID_unit + 1)

# Define the path to save the sample file
ruta_muestra <- "C:/Users/Carlos/Desktop/Investigación IA/2-Base de datos/original_randomized_sample.csv"
write.csv(datos_muestra, ruta_muestra, row.names = FALSE, fileEncoding = "UTF-8")

# Display the first rows of the sample (without the long text columns)
head(datos_muestra) %>%
  select(-Pregunta, -Respuesta)

##    Asignatura Precisión Relevancia Coherencia Id_unit          Tema ID_unit
## 4  Bioquímica         5          5          5       5   Aminoácidos       5
## 7  Bioquímica         4          5          5       8   Aminoácidos       8
## 8  Bioquímica         5          5          5       9   Aminoácidos       9
## 12 Bioquímica         5          5          5      13 Monosacáridos      13
## 13 Bioquímica         5          5          5      14 Monosacáridos      14
## 19 Bioquímica         5          5          5      20 Monosacáridos      20
##         Prob Stratum   evaluador
## 4  0.2727273       1 Evaluador C
## 7  0.2727273       1 Evaluador B
## 8  0.2727273       1 Evaluador A
## 12 0.2727273       2 Evaluador B
## 13 0.2727273       2 Evaluador A
## 19 0.2727273       2 Evaluador A

"Once you have the CSV file, consider importing it to Excel using UTF-8 encoding and not opening it directly, as it might contain strange characters."

## [1] "Once you have the CSV file, consider importing it to Excel using UTF-8 encoding and not opening it directly, as it might contain strange characters."

Sampling and randomizations for Llama 3.1 70B Instruct data analysis

Carlos Javier Pérez Pérez

4 de octubre de 2024

Required Libraries to Run the Script

Maximum Variation Sampling

Calculating the Margin of Error for Maximum Variation Sampling (256 requests-responses) at a 90% confidence level

Margin of Error in Stratified Random Sampling

Calculating the Margin of Error for Stratified Random Sampling (72 requests-responses) at a 90% confidence level

Randomization for Uniform Random Sampling

Randomization and stratification of the sample to be evaluated by external evaluators