{
  "version": "v4",
  "created_at": "2026-03-04T08:33:04.438649",
  "total_questions": 250,
  "answerable_questions": 200,
  "ood_questions": 50,
  "by_category": {
    "factual_recall": 50,
    "conceptual": 50,
    "technical": 50,
    "cross_document": 30,
    "synthesis": 20,
    "out_of_domain": 50
  },
  "by_min_core": {
    "5": 22,
    "10": 22,
    "20": 44,
    "50": 112
  },
  "questions": [
    {
      "id": "01_sarek_FR1",
      "question": "What is the total storage capacity required for the resulting BAM, annotated VCF, and CNV files for a 90x/90x tumor/normal WGS dataset processed by Sarek?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "01_sarek.pdf"
      ],
      "expected_answer": "Processing a 90x/90x WGS dataset requires approximately 1.4 TB (specifically 1378 GB) for the final result files, excluding temporary data. This storage allocation covers BAM files, annotated VCF files, and CNV files.",
      "expected_concepts": [
        "1.4 TB",
        "1378 GB",
        "90x/90x dataset",
        "BAM files",
        "VCF files"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "01_sarek_CO2",
      "question": "How does the Sarek workflow architecture address the dual needs of computational reproducibility and scientific best practices in variant calling?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "01_sarek.pdf"
      ],
      "expected_answer": "Sarek ensures computational reproducibility and portability by utilizing the Nextflow workflow language alongside containerization (Docker, Singularity) and environment management (Conda). Scientifically, it adheres to GATK best-practice recommendations for pre-processing steps, such as deduplication and recalibration, before performing variant calling using established tools like HaplotypeCaller or Mutect2.",
      "expected_concepts": [
        "Nextflow",
        "Containerization",
        "GATK best-practice",
        "Reproducibility",
        "Portability"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "01_sarek_TE3",
      "question": "Based on the benchmarking results, what is the specific F1 score achieved for somatic single-base mutations (SSM) when using the intersection of GATK4 Mutect2 and Strelka2?",
      "category": "technical",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "01_sarek.pdf"
      ],
      "expected_answer": "Sarek achieves an F1 score of 0.80 for somatic single-base mutations (SSM) when evaluating the intersection of the GATK4 Mutect2 and Strelka2 callers. This result is based on benchmarking against a defined 'Gold Set' as described in the somatic variant calling analysis.",
      "expected_concepts": [
        "F1 score",
        "0.80",
        "Mutect2",
        "Strelka2",
        "Intersection"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "02_snakemake_FR1",
      "question": "According to the performance benchmarks provided in the paper, what are the specific time and memory requirements for the Snakemake scheduler to compute a graph of 90,000 jobs?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "02_snakemake.pdf"
      ],
      "expected_answer": "The Snakemake scheduler requires 37 seconds and 1.1 GB of memory to compute a graph for 90,000 jobs. This performance demonstrates that the scheduler scales linearly as the number of jobs increases.",
      "expected_concepts": [
        "37 seconds",
        "1.1 GB",
        "90,000 jobs",
        "linear scaling"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "02_snakemake_CO2",
      "question": "How does Snakemake's implementation of a blockchain-style hashing scheme facilitate cross-workflow result caching?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "02_snakemake.pdf"
      ],
      "expected_answer": "Snakemake employs a blockchain-style SHA-256 hashing scheme to create unique identifiers for job states based on code, parameters, and input data. This allows the system to recognize identical tasks across different workflow runs or projects, enabling the reuse of cached results and ensuring traceability of the data provenance.",
      "expected_concepts": [
        "SHA-256",
        "hashing",
        "result caching",
        "cross-workflow",
        "provenance"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "02_snakemake_TE3",
      "question": "Explain the technical approach Snakemake uses to optimize job scheduling and how it handles job submission overhead in cluster or cloud environments.",
      "category": "technical",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "02_snakemake.pdf"
      ],
      "expected_answer": "Snakemake uses Mixed Integer Linear Programming (MILP) to solve the scheduling problem by defining an objective function and constraints related to resources and temporary files. To handle overhead in cluster or cloud environments, it utilizes graph partitioning to group multiple rules/jobs into a single submitted unit, which reduces the frequency and cost of job submissions.",
      "expected_concepts": [
        "Mixed Integer Linear Programming (MILP)",
        "objective function",
        "graph partitioning",
        "job grouping",
        "cluster/cloud overhead"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "03_nfcore_framework_FR1",
      "question": "According to the findings, what was the specific reduction in storage usage for the work directory when switching from BAM to CRAM, and what were the corresponding data volumes?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "03_nfcore_framework.pdf"
      ],
      "expected_answer": "Switching from BAM to CRAM format reduced the storage usage for the work directory by 65%. Specifically, the storage volume dropped from 170.4 TB when using BAM to 59.7 TB when using CRAM.",
      "expected_concepts": [
        "65% reduction",
        "CRAM",
        "170.4 TB",
        "59.7 TB",
        "work directory"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "03_nfcore_framework_CO2",
      "question": "How does the nf-core/sarek 3.1.1 pipeline leverage architectural and methodological changes to optimize cost and performance for large-scale data analysis?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "03_nfcore_framework.pdf"
      ],
      "expected_answer": "The pipeline achieves significant optimization by transitioning to the Nextflow DSL2 framework, which allows for modular design and intra-sample parallelization through FastQ sharding. By combining these parallelization techniques with a switch from BAM to CRAM internal file formats and the integration of modern tools like DeepVariant, the system realizes a 70% reduction in commercial cloud costs and up to an 84% reduction in runtime for single samples.",
      "expected_concepts": [
        "Nextflow DSL2",
        "intra-sample parallelization",
        "sharding",
        "CRAM",
        "70% cost reduction"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "03_nfcore_framework_TE3",
      "question": "Based on the benchmarking of mapping processes, what was the numerical impact of input sharding on median runtime, and at what point was the maximum efficiency plateau reached?",
      "category": "technical",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "03_nfcore_framework.pdf"
      ],
      "expected_answer": "Sharding the input files for intra-sample parallelization reduced the median runtime of the mapping processes to 37% of the original time. The benchmarking results show that this reduction in runtime reaches a performance plateau when the input is divided into 12 shards.",
      "expected_concepts": [
        "sharding",
        "median runtime",
        "37%",
        "12 shards",
        "plateau"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "04_fastp_FR1",
      "question": "In a speed evaluation on a 9,316 M base dataset, what were the specific processing times for fastp (PE) compared to Trimmomatic?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "04_fastp.pdf"
      ],
      "expected_answer": "In the evaluation, fastp (PE) took 13.3 minutes to process the dataset. In contrast, Trimmomatic required 60.9 minutes, making fastp approximately 4.5 times faster in this specific comparison.",
      "expected_concepts": [
        "13.3 minutes",
        "60.9 minutes",
        "fastp (PE)",
        "Trimmomatic",
        "speed evaluation"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "04_fastp_CO2",
      "question": "What design choices and implementation strategies allow fastp to achieve higher performance and lower I/O overhead than traditional FASTQ tools?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "04_fastp.pdf"
      ],
      "expected_answer": "fastp achieves high performance by being implemented in C++ and utilizing multi-threading via a thread pool with data packing (N=1000 reads per pack). To minimize I/O overhead, it employs a single-scan processing architecture, which allows it to perform multiple QC and filtering tasks in one pass rather than reading and writing data multiple times.",
      "expected_concepts": [
        "C++ implementation",
        "multi-threading",
        "thread pool",
        "single-scan processing",
        "I/O overhead",
        "data packing"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "04_fastp_TE3",
      "question": "How does fastp's methodology for adapter detection differ between single-end and paired-end sequencing data?",
      "category": "technical",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "04_fastp.pdf"
      ],
      "expected_answer": "For single-end data, fastp utilizes a K-mer based automatic adapter detection method. For paired-end data, it uses an overlap-analysis based approach, which not only detects adapters but also enables base correction in the overlapping regions of the reads.",
      "expected_concepts": [
        "K-mer based",
        "automatic adapter detection",
        "single-end",
        "overlap-analysis",
        "paired-end",
        "base correction"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "05_multiqc_FR1",
      "question": "According to the paper, how many common bioinformatics tools does MultiQC support at the time of writing?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "05_multiqc.pdf"
      ],
      "expected_answer": "MultiQC successfully aggregates output from 22 common bioinformatics tools. This includes a variety of tools such as aligners, processing tools, and quality control (QC) programs.",
      "expected_concepts": [
        "22",
        "bioinformatics tools",
        "aligners",
        "processing tools",
        "QC programs"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "05_multiqc_CO2",
      "question": "How does MultiQC's approach to data presentation assist researchers in identifying quality issues across large-scale projects?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "05_multiqc.pdf"
      ],
      "expected_answer": "By aggregating data from multiple samples into a single report with shared plots, MultiQC allows for a fast scan of key statistics across an entire project. This centralized view facilitates the identification of batch effects and outliers through accurate comparison between samples, revealing subtle differences that might be missed in individual reports.",
      "expected_concepts": [
        "batch effects",
        "outliers",
        "shared plots",
        "sample comparison",
        "centralized report"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "05_multiqc_TE3",
      "question": "Which specific software libraries and architectural mechanisms does MultiQC use to handle data visualization and support custom extensions?",
      "category": "technical",
      "subcategory": null,
      "min_core": 5,
      "source_files": [
        "05_multiqc.pdf"
      ],
      "expected_answer": "MultiQC utilizes the Jinja2 template engine for rendering HTML reports, with HighCharts for interactive JavaScript visualizations and MatPlotLib for static plots in large datasets. For extensibility, it employs a plugin system using Python setuptools entry points to allow for custom submodules and extensions.",
      "expected_concepts": [
        "HighCharts",
        "MatPlotLib",
        "Jinja2",
        "setuptools entry points",
        "plugin system"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "06_star_aligner_FR1",
      "question": "What is the reported mapping speed of STAR when using a 12-core server for 2 x 76 bp paired-end reads, and how does this compare to other aligners?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "06_star_aligner.pdf"
      ],
      "expected_answer": "STAR can align 550 million 2 x 76 bp paired-end reads per hour to the human genome on a 12-core server. This performance makes STAR approximately 50 times faster than other existing RNA-seq aligners available at the time of the study.",
      "expected_concepts": [
        "550 million reads per hour",
        "12-core server",
        "50 times faster",
        "paired-end reads"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "06_star_aligner_CO2",
      "question": "How does the Maximum Mappable Prefix (MMP) search mechanism facilitate the identification of diverse sequence features beyond simple genomic mapping?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "06_star_aligner.pdf"
      ],
      "expected_answer": "The MMP search identifies the longest prefix of a read that matches the reference genome perfectly. By sequentially applying this search to the unmapped portions of a read, STAR can discovery diverse features such as splice junctions (canonical and non-canonical), mismatches, and sequence tails like poly-A stretches or adapters, as well as chimeric/fusion junctions.",
      "expected_concepts": [
        "Sequential search",
        "splice junctions",
        "mismatches",
        "sequence tails",
        "chimeric junctions"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "06_star_aligner_TE3",
      "question": "According to the experimental benchmarks using ENCODE datasets, what percentage of reads did STAR successfully align, and which other aligner achieved a similar percentage?",
      "category": "technical",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "06_star_aligner.pdf"
      ],
      "expected_answer": "In the experimental ENCODE datasets, STAR aligned 94% of the reads. This percentage was the highest among the tested tools and was matched only by the GSNAP aligner, while other tools like RUM, MapSplice, and TopHat2 aligned significantly fewer reads (86%, 85%, and 71% respectively).",
      "expected_concepts": [
        "94%",
        "ENCODE datasets",
        "GSNAP",
        "TopHat2",
        "alignment percentage"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "07_salmon_FR1",
      "question": "By what percentage range was Salmon's sensitivity higher in differential expression testing compared to other tools at the same False Discovery Rates (FDRs)?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "07_salmon.pdf"
      ],
      "expected_answer": "Salmon exhibited significantly higher sensitivity in differential expression testing, showing a range of 53% to 450% higher sensitivity compared to other tools at the same False Discovery Rates.",
      "expected_concepts": [
        "sensitivity",
        "False Discovery Rates",
        "FDR",
        "differential expression testing"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "07_salmon_CO2",
      "question": "How does Salmon's method of bias correction specifically improve the outcomes of differential expression (DE) analysis?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "07_salmon.pdf"
      ],
      "expected_answer": "Salmon is the first transcriptome-wide quantifier to correct for fragment GC content bias, alongside sequence-specific and positional biases. By modeling these technical artifacts, Salmon reduces false-positive DE calls (decreasing them by a factor of ~2.6 in specific GEUVADIS comparisons) and improves the overall accuracy and reliability of the analysis.",
      "expected_concepts": [
        "fragment GC content bias",
        "technical artifacts",
        "false-positive",
        "differential expression analysis",
        "reliability"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "07_salmon_TE3",
      "question": "Describe the algorithmic components and optimization techniques Salmon uses to achieve speed comparable to ultra-fast alignment-free tools while maintaining accuracy.",
      "category": "technical",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "07_salmon.pdf"
      ],
      "expected_answer": "Salmon utilizes a dual-phase parallel inference algorithm consisting of online and offline phases, combined with lightweight-mapping (quasi-mapping) for ultra-fast read mapping. For quantification, it employs variational Bayesian (VB) and Expectation-Maximization (EM) optimization, specifically utilizing stochastic, collapsed variational Bayesian inference and rich equivalence classes.",
      "expected_concepts": [
        "dual-phase parallel inference",
        "lightweight-mapping",
        "variational Bayesian",
        "Expectation-Maximization",
        "rich equivalence classes"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "08_deseq2_FR1",
      "question": "In the reproducibility analysis where a dataset was split into two independent halves, what specific improvement in the overlap of the top 100 genes was observed when using shrunken MAP estimates compared to standard MLE estimates?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "08_deseq2.pdf"
      ],
      "expected_answer": "The overlap of the top 100 genes ranked by fold change improved from 21% using standard MLE estimates to 81% when using shrunken MAP estimates. This demonstrates that shrunken LFC estimates are significantly more reproducible than standard estimates.",
      "expected_concepts": [
        "MAP estimates",
        "MLE estimates",
        "reproducibility",
        "fold change",
        "overlap percentage"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "08_deseq2_CO2",
      "question": "How does the DESeq2 framework utilize Empirical Bayes shrinkage to handle the high uncertainty associated with low-count genes in both dispersion and fold change estimation?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "08_deseq2.pdf"
      ],
      "expected_answer": "DESeq2 applies Empirical Bayes shrinkage in two distinct steps: first, gene-wise dispersion estimates are shrunk toward a mean-dependent trend line to stabilize variance. Second, logarithmic fold change (LFC) estimates are shrunk toward zero using a zero-centered normal prior (MAP estimation), which effectively filters out noisy, high-fold-change estimates caused by low-count genes and improves the ranking of genes for downstream analysis.",
      "expected_concepts": [
        "Empirical Bayes shrinkage",
        "dispersion estimation",
        "MAP estimation",
        "low-count genes",
        "logarithmic fold change",
        "trend line"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "08_deseq2_TE3",
      "question": "What is the specific purpose of the regularized logarithm (rlog) transformation, and how does its performance compare to an ordinary log transformation in terms of variance stabilization?",
      "category": "technical",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "08_deseq2.pdf"
      ],
      "expected_answer": "The rlog transformation is designed for variance stabilization across the entire dynamic range of count data. Unlike the ordinary log transformation, where gene-wise standard deviation remains highly variable and dependent on the mean, rlog ensures that the standard deviation remains stable across the range of mean counts, which is particularly beneficial for sample clustering and distance visualizations.",
      "expected_concepts": [
        "regularized logarithm (rlog)",
        "variance stabilization",
        "standard deviation",
        "mean expression",
        "dynamic range",
        "clustering"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "09_seqkit_FR1",
      "question": "Which operating systems are supported by SeqKit and what are the requirements for their installation and configuration?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "09_seqkit.pdf"
      ],
      "expected_answer": "SeqKit provides executable binary files for all major operating systems, specifically Windows, Linux, and Mac OSX. These binaries are designed to be used directly without the need for any external dependencies or pre-configurations.",
      "expected_concepts": [
        "Windows",
        "Linux",
        "Mac OSX",
        "executable binary",
        "dependencies"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "09_seqkit_CO2",
      "question": "How does SeqKit achieve stable memory usage and efficiency when processing datasets that are significantly larger than the human genome?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "09_seqkit.pdf"
      ],
      "expected_answer": "SeqKit maintains a stable peak memory usage of approximately 750-780 Mb by implementing a two-pass file reading mode and using indices instead of loading entire files into memory. Additionally, it utilizes MD5 digests to represent sequence content, which allows for memory-efficient deduplication even when the input data volume increases up to 32 times the human genome size.",
      "expected_concepts": [
        "two-pass mode",
        "MD5 digests",
        "deduplication",
        "peak memory",
        "indices"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "09_seqkit_TE3",
      "question": "What specific technical optimization was applied to SeqKit's reverse complementation process, and what was the measured performance gain compared to using standard hash maps?",
      "category": "technical",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "09_seqkit.pdf"
      ],
      "expected_answer": "The authors optimized the reverse complementation process by using a slice data structure and ASCII code indexing. This approach replaced the standard use of hash maps, resulting in an approximate 20-fold (20x) increase in execution speed.",
      "expected_concepts": [
        "reverse complementation",
        "slice data structure",
        "ASCII code indexing",
        "hash maps",
        "20x speedup"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "10_cutadapt_FR1",
      "question": "What unique capability does Cutadapt offer compared to other standalone tools in the context of read trimming?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "10_cutadapt.pdf"
      ],
      "expected_answer": "Cutadapt is the only standalone tool that is capable of correctly trimming color-space reads (SOLiD data). This distinguishes it from other tools that may require integrated pipelines or lack support for double encoding.",
      "expected_concepts": [
        "standalone tool",
        "color-space",
        "trimming",
        "SOLiD"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "10_cutadapt_CO2",
      "question": "How does the implementation of Cutadapt balance the trade-off between user accessibility and computational efficiency?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "10_cutadapt.pdf"
      ],
      "expected_answer": "Cutadapt is primarily implemented in Python to ensure ease of use and flexibility, but it utilizes C extensions for the core alignment tasks to achieve high speed. Despite these optimizations, the overall performance is often limited by input/output operations, such as file parsing, which can consume more than half of the total processing time.",
      "expected_concepts": [
        "Python implementation",
        "C extensions",
        "alignment speed",
        "I/O bottleneck",
        "parsing"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "10_cutadapt_TE3",
      "question": "What are the specific performance metrics for Cutadapt when processing 35 bp colour-space reads with an 18 bp adapter on a 2.66 GHz Intel Core 2 processor?",
      "category": "technical",
      "subcategory": null,
      "min_core": 10,
      "source_files": [
        "10_cutadapt.pdf"
      ],
      "expected_answer": "Under these specific hardware and sequence parameters, Cutadapt can process approximately 1 million reads per minute. This performance corresponds to a per-read trimming speed of approximately 0.06 ms on a single processor core.",
      "expected_concepts": [
        "1 million reads per minute",
        "0.06 ms per read",
        "single core",
        "2.66 GHz Intel Core 2"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "11_asf_burkina_faso_FR1",
      "question": "In the study of the 2018 African Swine Fever Virus (ASFV) outbreaks, how many blood samples were collected in total and how many of those were confirmed positive for ASFV DNA?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "11_asf_burkina_faso.pdf"
      ],
      "expected_answer": "According to the study, a total of 62 blood samples were collected from pigs during the field investigations. Out of these sixty-two samples, real-time PCR results confirmed the presence of ASFV DNA in 20 samples.",
      "expected_concepts": [
        "62 blood samples",
        "20 positive",
        "real-time PCR",
        "ASFV DNA"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "11_asf_burkina_faso_CO2",
      "question": "How did the researchers use multi-gene analysis to establish the genetic identity and classification of the ASFV strains responsible for the outbreaks?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "11_asf_burkina_faso.pdf"
      ],
      "expected_answer": "The researchers combined sequencing and phylogenetic analysis of several genes to classify the isolates. They used the partial p72 (B646L) and complete p54 (E183L) genes to identify the virus as Genotype I (specifically Genotype Ia), while the analysis of partial CD2v amino acid sequences allowed them to cluster the isolates into Serogroup 4. This multi-locus approach provided a higher resolution of the virus's identity than single-gene sequencing alone.",
      "expected_concepts": [
        "Genotype I",
        "Serogroup 4",
        "p72",
        "p54",
        "CD2v",
        "phylogenetic analysis"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "11_asf_burkina_faso_TE3",
      "question": "What specific software and phylogenetic methods were utilized for the analysis of the ASFV sequences, and what variation was found in the Central Variable Region (CVR)?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "11_asf_burkina_faso.pdf"
      ],
      "expected_answer": "The study utilized MEGA 7 software to perform phylogenetic analysis using Neighbor-Joining (NJ), Minimum Evolution (ME), and Maximum Likelihood (ML) methods. Analysis of the B602L gene's Central Variable Region (CVR) revealed four distinct variants of the Tetrameric Repeat Sequence (TRS), containing 32, 24, 23, and 12 repeat units respectively.",
      "expected_concepts": [
        "MEGA 7",
        "Neighbor-Joining",
        "Minimum Evolution",
        "Maximum Likelihood",
        "Central Variable Region",
        "Tetrameric Repeat Sequence"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "12_hpai_netherlands_FR1",
      "question": "What was the specific percentage increase in the distribution of HPAI outbreaks in meat-type poultry when comparing the 2014\u20132018 period to the 2020\u20132022 period?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "12_hpai_netherlands.pdf"
      ],
      "expected_answer": "The distribution of HPAI outbreaks in meat-type poultry increased from 6.3% in the 2014\u20132018 period to 39.5% in the 2020\u20132022 period.",
      "expected_concepts": [
        "meat-type poultry",
        "6.3%",
        "39.5%",
        "2014\u20132018",
        "2020\u20132022"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "12_hpai_netherlands_CO2",
      "question": "How do the clinical presentations and mortality detection thresholds of HPAI differ between chicken populations (Galliformes) and duck populations (Anseriformes) according to the study?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "12_hpai_netherlands.pdf"
      ],
      "expected_answer": "Ducks are more likely to exhibit nervous/locomotor (66.7%) and reproductive signs (100%), whereas chickens primarily show mucosal/skin signs (55.0\u201367.8%). Additionally, chickens are more easily detected via mortality monitoring, with over 80% reaching a Mortality Ratio (MR) > 3 before notification, compared to only 53.8% of duck flocks.",
      "expected_concepts": [
        "nervous/locomotor signs",
        "reproductive tract signs",
        "mucosal/skin signs",
        "Mortality Ratio (MR)",
        "Galliformes",
        "Anseriformes"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "12_hpai_netherlands_TE3",
      "question": "How did the researchers define and calculate the Mortality Ratio (MR) metric used to assess early warning signals on the infected farms?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "12_hpai_netherlands.pdf"
      ],
      "expected_answer": "The Mortality Ratio (MR) was calculated by comparing the mortality recorded on the day before notification to the average mortality recorded during the previous week. A threshold of MR > 3 was specifically used to identify farms that reached an early warning level of increased mortality.",
      "expected_concepts": [
        "day-before-notification mortality",
        "average of the previous week",
        "Mortality Ratio",
        "MR > 3",
        "early warning threshold"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "13_lsd_nepal_FR1",
      "question": "What were the overall morbidity, mortality, and case fatality rates recorded during the LSD outbreak in the study area?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "13_lsd_nepal.pdf"
      ],
      "expected_answer": "The overall morbidity rate of LSD in the study area was 28.02%, while the mortality rate was 3.06%. Additionally, the case fatality rate (CFR) for the outbreak was calculated to be 10.90%, based on 47 deaths out of 431 sick animals.",
      "expected_concepts": [
        "morbidity rate 28.02%",
        "mortality rate 3.06%",
        "case fatality rate",
        "10.90%",
        "Nawalpur district"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "13_lsd_nepal_CO2",
      "question": "Based on the epidemiological data, which specific cattle group was most affected by the outbreak and what were the primary clinical manifestations observed?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "13_lsd_nepal.pdf"
      ],
      "expected_answer": "Dry cattle, including pregnant cows and heifers, were the most affected group with a morbidity rate of 47.62%, which was higher than that of milking cattle. The outbreak was primarily characterized by skin nodules, found in 97% of case farms, followed by a significant decrease in milk production and lameness.",
      "expected_concepts": [
        "dry cattle",
        "morbidity",
        "skin nodules",
        "milk production",
        "heifers",
        "lameness"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "13_lsd_nepal_TE3",
      "question": "According to the multivariable logistic regression analysis, what was identified as the most significant risk factor for LSD transmission and what were its associated statistical metrics?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "13_lsd_nepal.pdf"
      ],
      "expected_answer": "The multivariable analysis identified the presence of clinical signs in neighboring farms as the most significant risk factor for the spread of LSD. This variable showed an Odds Ratio (OR) of 88.45, with a 95% Confidence Interval of 13.27\u2013589.36 and a statistical significance of p < 0.001.",
      "expected_concepts": [
        "multivariable logistic regression",
        "Odds Ratio",
        "neighboring farms",
        "88.45",
        "95% CI",
        "p-value"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "14_bovine_tb_cameroon_FR1",
      "question": "What were the specific bovine tuberculosis (bTB) prevalence estimates calculated in this study based on the different diagnostic tests and thresholds used?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "14_bovine_tb_cameroon.pdf"
      ],
      "expected_answer": "The study estimated the prevalence of bTB as 6.8% (95% CI: 4.35%\u20139.41%) for the Comparative Intradermal Tuberculin Test (CIDT) at a 3 mm threshold, 0.6% (95% CI: 0%\u20131.2%) for the CIDT at a 4 mm threshold, and 1.8% (95% CI: 0%\u20133.6%) for the Simple Intradermal Tuberculin Test (SITT) at a 4 mm threshold.",
      "expected_concepts": [
        "CIDT",
        "SITT",
        "threshold",
        "prevalence",
        "3 mm",
        "4 mm"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "14_bovine_tb_cameroon_CO2",
      "question": "How does the choice of diagnostic method and skinfold thickness threshold impact the assessment of bTB exposure according to the study's findings?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "14_bovine_tb_cameroon.pdf"
      ],
      "expected_answer": "The prevalence of bTB is highly sensitive to both the type of test (CIDT vs. SITT) and the specific millimeter threshold applied to the skinfold reaction. The results show that using a more stringent 4 mm threshold for CIDT results in a much lower prevalence (0.6%) compared to a 3 mm threshold (6.8%), indicating that the diagnostic interpretation grid significantly shifts the perceived disease burden within the same cattle population.",
      "expected_concepts": [
        "diagnostic test",
        "skinfold thickness",
        "interpretation grid",
        "sensitivity",
        "threshold variation"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "14_bovine_tb_cameroon_TE3",
      "question": "Which specific statistical software and metrics were employed to refine the logistic regression models and verify the independence of the identified risk factors?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "14_bovine_tb_cameroon.pdf"
      ],
      "expected_answer": "The researchers used R software for univariate and multivariate logistic regression modeling. To ensure the quality and proper selection of the models, the Akaike Information Criterion (AIC) was utilized, and the Variance Inflation Factor (VIF) was calculated to test for multi-collinearity among the variables in the final model.",
      "expected_concepts": [
        "R software",
        "Akaike Information Criterion (AIC)",
        "Variance Inflation Factor (VIF)",
        "multi-collinearity",
        "logistic regression"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "15_rabies_tanzania_FR1",
      "question": "How many human rabies deaths were recorded during the study period, and what was observed regarding their treatment status?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "15_rabies_tanzania.pdf"
      ],
      "expected_answer": "A total of 48 human rabies deaths were identified over the 2002-2022 study period, with the number of deaths peaking in 2011. Notably, none of the victims of these recorded deaths had completed a full course of post-exposure vaccinations.",
      "expected_concepts": [
        "48 human rabies deaths",
        "post-exposure vaccinations",
        "2011 peak",
        "Serengeti District"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "15_rabies_tanzania_CO2",
      "question": "Explain how spatial heterogeneity in vaccination coverage impacts the persistence of rabies even when average coverage targets are met.",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "15_rabies_tanzania.pdf"
      ],
      "expected_answer": "Spatial heterogeneity in vaccination coverage acts as a significant driver of rabies persistence because uneven distribution among villages creates pockets of high susceptibility. The study's power mean susceptibility models demonstrated that for the same average coverage, high heterogeneity among non-focal villages resulted in an effective focal incidence 2.6 times greater than a scenario with homogeneous coverage.",
      "expected_concepts": [
        "spatial heterogeneity",
        "power mean susceptibility",
        "focal incidence",
        "homogeneous distribution",
        "rabies persistence"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "15_rabies_tanzania_TE3",
      "question": "What methodological approach and software tool were used to distinguish between local rabies transmission and disease incursions, and how did the proportion of incursions change by 2022?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "15_rabies_tanzania.pdf"
      ],
      "expected_answer": "The study utilized transmission-tree reconstruction using the 'treerabid' R package to identify rabies incursions from outside the district. The analysis found that as local transmission was interrupted, the proportion of cases attributed to incursions rose significantly, reaching 50% of all recorded cases in 2022, compared to only 3% prior to 2018.",
      "expected_concepts": [
        "treerabid R package",
        "transmission-tree reconstruction",
        "disease incursions",
        "local transmission",
        "50% of cases"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "16_ppr_ethiopia_FR1",
      "question": "How many total PPR outbreaks were recorded in the Borena Zone between 2018 and 2022, and which specific month saw the highest percentage of these outbreaks?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "16_ppr_ethiopia.pdf"
      ],
      "expected_answer": "According to the retrospective analysis of the DOVAR-II system, a total of 53 PPR outbreaks occurred in the Borena Zone between 2018 and 2022. The seasonal peak for these outbreaks was in January, which accounted for 20.8% of the total recorded cases.",
      "expected_concepts": [
        "53 outbreaks",
        "Borena Zone",
        "2018-2022",
        "January",
        "20.8%"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "16_ppr_ethiopia_CO2",
      "question": "Why does the study conclude that the current PPR herd immunity in the Borena Zone is insufficient according to international standards?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "16_ppr_ethiopia.pdf"
      ],
      "expected_answer": "The study found that the herd immunity level in the region was 68.8% among vaccinated animals. This is considered insufficient because it falls below the 80% threshold recommended by the FAO and WOAH (World Organisation for Animal Health) to effectively control and eliminate the disease within a population.",
      "expected_concepts": [
        "68.8%",
        "80% threshold",
        "FAO-WOAH",
        "herd immunity",
        "vaccination coverage"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "16_ppr_ethiopia_TE3",
      "question": "Using the multivariable logistic regression results, how do animal age and origin (gifted or purchased) quantify the risk of PPR seropositivity in nonvaccinated animals?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "16_ppr_ethiopia.pdf"
      ],
      "expected_answer": "The multivariable logistic regression showed that older animals were 7.3 times more likely to be seropositive (OR: 7.3; 95% CI: 2.7\u201319.4) compared to adult animals. Furthermore, animal origin was a significant risk factor, with gifted animals being 8.3 times more likely (OR: 8.3) and purchased animals being 4 times more likely (OR: 4) to be seropositive than those born within the flock.",
      "expected_concepts": [
        "multivariable logistic regression",
        "odds ratio",
        "older animals",
        "gifted animals",
        "purchased animals",
        "OR: 7.3"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "17_brucellosis_ethiopia_FR1",
      "question": "What specific districts in the Somali Region of Ethiopia were included as study areas for this research?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "17_brucellosis_ethiopia.pdf"
      ],
      "expected_answer": "The study was conducted in three specific districts within the Somali Region of Ethiopia: Goro Baqaqsa, Guradamole, and Dolo Ado. These locations are represented in the study's map of the study areas.",
      "expected_concepts": [
        "Somali Region",
        "Goro Baqaqsa",
        "Guradamole",
        "Dolo Ado",
        "Ethiopia"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "17_brucellosis_ethiopia_CO2",
      "question": "Based on the study's findings, how does the clinical history of livestock relate to the likelihood of brucellosis seropositivity in both small ruminants and camels?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "17_brucellosis_ethiopia.pdf"
      ],
      "expected_answer": "In both species, a history of reproductive health issues, specifically Retained Fetal Membranes (RFM), is a major predictor of seropositivity. Camels with a history of RFM showed the strongest association with the disease (OR: 35), while small ruminants with the same history were 9 times more likely to test positive compared to those without.",
      "expected_concepts": [
        "Retained fetal membranes (RFM)",
        "seropositivity",
        "risk factors",
        "small ruminants",
        "camels",
        "Odds Ratio (OR)"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "17_brucellosis_ethiopia_TE3",
      "question": "Describe the laboratory diagnostic sequence and the statistical methods used to identify significant risk factors for brucellosis in this study.",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "17_brucellosis_ethiopia.pdf"
      ],
      "expected_answer": "The researchers employed a two-step diagnostic approach, starting with the Rose Bengal Plate Test (RBPT) for initial screening of serum samples, followed by a competitive enzyme-linked immunosorbent assay (cELISA) for confirmation. Statistical analysis was performed using R software, utilizing both univariable and multivariable logistic regression to calculate odds ratios and determine p-values for risk factors.",
      "expected_concepts": [
        "Rose Bengal Plate Test (RBPT)",
        "cELISA",
        "multivariable logistic regression",
        "R software",
        "screening",
        "serum samples"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "18_fmd_review_FR1",
      "question": "According to the paper, what are the primary sites of FMDV infection in ruminants and pigs?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "18_fmd_review.pdf"
      ],
      "expected_answer": "The primary site of FMDV infection in ruminants is the nasopharynx (specifically the nasopharyngeal mucosa). In pigs, the primary infection site is the oropharynx, typically involving the oropharyngeal tonsils.",
      "expected_concepts": [
        "nasopharynx",
        "oropharynx",
        "ruminants",
        "pigs",
        "primary site of infection"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "18_fmd_review_CO2",
      "question": "How does the FMDV 'carrier state' differ between pigs and ruminants, and what biological evidence challenges the historical 28-day threshold for this state?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "18_fmd_review.pdf"
      ],
      "expected_answer": "Pigs do not develop a persistent carrier state and efficiently clear FMDV from all tissues following recovery, whereas ruminants can maintain the virus long-term. The historical 28-day threshold for defining this state is considered biologically arbitrary because it was based on fixed study durations; current research shows the biological transition to a persistent state actually occurs between 7-21 days post-infection.",
      "expected_concepts": [
        "carrier state",
        "persistence",
        "28-day threshold",
        "ruminants",
        "viral clearance",
        "7-21 days"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "18_fmd_review_TE3",
      "question": "What are the specific quantitative estimates for preclinical transmission (theta) in pigs and cattle as identified in the research synthesis?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "18_fmd_review.pdf"
      ],
      "expected_answer": "The proportion of preclinical transmission, represented as theta, is estimated at 0.12 (12%) for pigs. In cattle, this value is significantly higher, ranging up to 0.44 (44%) when using proxy measures of viral presence.",
      "expected_concepts": [
        "preclinical transmission",
        "theta",
        "0.12",
        "0.44",
        "proxy measures"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "19_hpai_canada_FR1",
      "question": "According to the epidemiological review of the 2022 H5N1 outbreak, how many birds were culled in Canada?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "19_hpai_canada.pdf"
      ],
      "expected_answer": "In 2022, Canada experienced a massive wave of HPAI H5N1 infections that resulted in the culling of over 7 million birds. These outbreaks have continued to profoundly affect commercial bird farms both in Canada and across the world.",
      "expected_concepts": [
        "7 million",
        "birds",
        "culled",
        "Canada",
        "HPAI H5N1"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "19_hpai_canada_CO2",
      "question": "What are the common clinical presentations and relative frequency of H5N1 clade 2.3.4.4b infections when comparing mammals to humans?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "19_hpai_canada.pdf"
      ],
      "expected_answer": "Infected mammals, including species like skunks and foxes, frequently present with severe neurological signs such as encephalitis or meningoencephalitis which often lead to death. Conversely, human infection with this specific clade remains extremely rare, with fewer than 10 cases reported globally despite the widespread mammalian spillover.",
      "expected_concepts": [
        "neurological signs",
        "encephalitis",
        "spillover",
        "human infection",
        "rare",
        "clade 2.3.4.4b"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "19_hpai_canada_TE3",
      "question": "Which specific viral mutations linked to mammalian adaptation were detected in the neurological tissues of carnivores in Canada and Europe?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "19_hpai_canada.pdf"
      ],
      "expected_answer": "The mutations identified in the brains of various carnivores include PB2-E627K, PB2-E627V, and D701N. These specific genetic markers are recognized as mammalian adaptation mutations and were found in samples collected from both European and Canadian animals.",
      "expected_concepts": [
        "PB2-E627K",
        "E627V",
        "D701N",
        "mammalian adaptation",
        "brain",
        "carnivores"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "20_lsd_review_FR1",
      "question": "What is the reported efficacy of the homologous vaccine Lumpi-ProVacInd developed in India?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "20_lsd_review.pdf"
      ],
      "expected_answer": "The homologous vaccine Lumpi-ProVacInd, which was developed by scientists at the Indian Council of Agricultural Research (ICAR), has an efficacy rate of 100%.",
      "expected_concepts": [
        "Lumpi-ProVacInd",
        "ICAR",
        "100%",
        "homologous vaccine"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "20_lsd_review_CO2",
      "question": "How does the clinical impact of Lumpy Skin Disease Virus (LSDV) differ in terms of its spread within a herd versus its lethality?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "20_lsd_review.pdf"
      ],
      "expected_answer": "LSDV is characterized by extremely high morbidity but relatively low mortality. While the illness can spread to up to 100% of a cattle herd (high morbidity), the mortality rate typically remains below 10%. The impact is primarily seen through widespread clinical symptoms like fever, skin nodules, and decreased milk production rather than high death rates.",
      "expected_concepts": [
        "high morbidity",
        "low mortality",
        "100% morbidity",
        "10% mortality",
        "clinical symptoms"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "20_lsd_review_TE3",
      "question": "According to the comparative analysis of vaccinations, what is the specific cross-protection rate between LSDV and related poxviruses, and which viruses are involved?",
      "category": "technical",
      "subcategory": null,
      "min_core": 20,
      "source_files": [
        "20_lsd_review.pdf"
      ],
      "expected_answer": "LSDV shares a high cross-protection rate of approximately 96% when vaccinations are administered using the Sheep Pox Virus and the Goat Pox Virus. This high rate of protection is a key factor in using these related virus strains for immunization strategies.",
      "expected_concepts": [
        "96% cross-protection",
        "Sheep Pox Virus",
        "Goat Pox Virus",
        "vaccination efficacy"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "21_livestock_gut_microbiome_review_FR1",
      "question": "In cattle, which two bacterial phyla are the most abundant and what percentage of the gastrointestinal tract bacterial community can they represent?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "21_livestock_gut_microbiome_review.pdf"
      ],
      "expected_answer": "In cattle, the most abundant bacterial phyla are Bacteroidetes and Firmicutes. Together, these two phyla can account for more than 90% of the entire gastrointestinal tract (GIT) bacterial community.",
      "expected_concepts": [
        "Bacteroidetes",
        "Firmicutes",
        "90%",
        "gastrointestinal tract",
        "bacterial phyla"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "21_livestock_gut_microbiome_review_CO2",
      "question": "What are the primary biases currently existing in livestock gut microbiome research regarding breed selection and taxonomic focus?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "21_livestock_gut_microbiome_review.pdf"
      ],
      "expected_answer": "Research is currently biased toward a small number of globally distributed, highly selected cosmopolitan breeds, which leads to an oversight of local breeds adapted to harsh environments. Furthermore, there is a significant taxonomic bias where the bulk of research focuses almost exclusively on bacteria, leaving other components like protists, fungi, and viruses under-studied.",
      "expected_concepts": [
        "cosmopolitan breeds",
        "local breeds",
        "taxonomic bias",
        "non-bacterial components",
        "protists",
        "fungi"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "21_livestock_gut_microbiome_review_TE3",
      "question": "Which specific bioinformatics tools and molecular sequencing techniques are utilized in the analysis of the livestock microbiome as described in the paper's methods?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "21_livestock_gut_microbiome_review.pdf"
      ],
      "expected_answer": "The study utilizes molecular techniques including Amplicon Metabarcoding (targeting 16S, 18S, and ITS rRNA), Shotgun Metagenomic Sequencing, and Metatranscriptomics (RNASeq). For data processing and functional prediction, bioinformatics tools such as QIIME, PICRUSt, and Tax4Fun are employed.",
      "expected_concepts": [
        "Amplicon Metabarcoding",
        "Shotgun Metagenomic Sequencing",
        "QIIME",
        "PICRUSt",
        "Tax4Fun",
        "Metatranscriptomics"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "22_pig_mags_FR1",
      "question": "What percentage of the recovered metagenome-assembled genomes (MAGs) were assigned to species without cultured representatives, and what does this suggest about the swine gut microbiome?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "22_pig_mags.pdf"
      ],
      "expected_answer": "82% of the MAGs were assigned to species that lack cultured representatives. This finding indicates that a large portion of the swine gut microbiome remains poorly characterized and potentially contains many novel species.",
      "expected_concepts": [
        "82%",
        "uncultured species",
        "swine gut microbiome",
        "poorly characterized",
        "MAGs"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "22_pig_mags_CO2",
      "question": "Based on the study's findings, how does the transition from nursing to weaning impact the taxonomic composition and relative abundance of specific bacterial groups in pigs?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "22_pig_mags.pdf"
      ],
      "expected_answer": "Weaning is associated with a significant shift in the microbiome, characterized by a decrease in the relative abundance of 69 MAGs, such as E. coli, and an increase in 140 MAGs, including Clostridium sp000435835. This transition is further visualized in a phylogenetic tree (Figure 1) which maps these changes alongside taxonomic classification and CAZyme counts to show how the community structure adapts to post-weaning conditions.",
      "expected_concepts": [
        "weaning",
        "relative abundance",
        "E. coli",
        "Clostridium sp000435835",
        "taxonomic classification",
        "microbial shift"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "22_pig_mags_TE3",
      "question": "Describe the bioinformatic workflow and specific quality thresholds used to identify, assess, and dereplicate the non-redundant MAGs reported in this research.",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "22_pig_mags.pdf"
      ],
      "expected_answer": "The study utilized MEGAHIT for de novo assembly and MetaBAT 2 for binning. Quality assessment was performed using CheckM, requiring >90% completeness and <5% contamination for high-quality MAGs, which were then dereplicated using dRep with a 99% Average Nucleotide Identity (ANI) threshold.",
      "expected_concepts": [
        "MEGAHIT",
        "MetaBAT 2",
        "CheckM",
        "dRep",
        "99% ANI",
        "completeness and contamination"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "23_swine_cultivation_FR1",
      "question": "How many bacterial colonies were isolated in total, and how many distinct sub-ASVs were identified within the Lactobacillus ASV5 group?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "23_swine_cultivation.pdf"
      ],
      "expected_answer": "The study isolated a total of 1,299 colonies, which clustered into 148 different bacterial taxa across 28 genera. Specifically, near-full-length 16S rRNA sequencing revealed the presence of 13 sub-ASVs within the single V4-defined ASV5 (Lactobacillus).",
      "expected_concepts": [
        "1,299 colonies",
        "148 bacterial taxa",
        "13 sub-ASVs",
        "Lactobacillus ASV5",
        "16S rRNA sequencing"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "23_swine_cultivation_CO2",
      "question": "How does the microbial diversity identified by culture-dependent (CD) methods compare to culture-independent (CI) methods across the four growth stages, and what does this imply about the overlap between the two approaches?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "23_swine_cultivation.pdf"
      ],
      "expected_answer": "Culture-dependent (CD) methods detected significantly higher microbial diversity (measured in ASVs) than culture-independent (CI) methods at all stages, such as 823 vs 555 ASVs at the finishing stage. However, only about half (45.2% to 56.4%) of the ASVs detected by CI methods were actually recovered using the 53 culture methods, suggesting that while CD methods capture more total variants, many taxa seen in sequencing remain uncultured.",
      "expected_concepts": [
        "CD vs CI diversity",
        "ASV counts",
        "Growth stages",
        "Recovery rate",
        "Overlap"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "23_swine_cultivation_TE3",
      "question": "Based on the PERMANOVA analysis, what was the most significant factor influencing the culturable bacterial community composition, and what specific statistical result supports this?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "23_swine_cultivation.pdf"
      ],
      "expected_answer": "Oxygen was identified as the primary factor shaping the culturable bacterial community. This was supported by PERMANOVA analysis, where oxygen ranked as the top driver with a mean F-score of 142.1 (P < 0.001).",
      "expected_concepts": [
        "Oxygen",
        "PERMANOVA",
        "F-score",
        "142.1",
        "culturable bacterial community"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "24_chicken_microbiome_FR1",
      "question": "Based on the longitudinal occurrence analysis over the 42-day study period, how many microbial genera were classified into the 'colonization', 'disappearance', and 'core' categories?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "24_chicken_microbiome.pdf"
      ],
      "expected_answer": "The study identified three distinct longitudinal occurrence patterns: 107 genera were classified in the 'colonization' category, 24 genera were in the 'disappearance' category, and 18 genera belonged to the 'core' category that remained present throughout the 42-day period.",
      "expected_concepts": [
        "colonization",
        "disappearance",
        "core category",
        "107 genera",
        "24 genera",
        "18 genera"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "24_chicken_microbiome_CO2",
      "question": "How does the choice between Relative Microbiome Profiling (RMP) and Quantitative Microbiome Profiling (QMP) affect the interpretation of microbial interactions and community variations?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "24_chicken_microbiome.pdf"
      ],
      "expected_answer": "RMP introduces significant bias by overestimating variations within microbial communities and failing to identify many significant positive taxon-taxon interactions that are captured by QMP. This discrepancy is particularly notable in the mycobiome, where RMP misses many correlations discovered through quantitative methods.",
      "expected_concepts": [
        "bias",
        "overestimated variations",
        "taxon-taxon interactions",
        "mycobiome",
        "quantitative profiling"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "24_chicken_microbiome_TE3",
      "question": "Using the ecological modeling metrics employed in the study, what are the primary assembly processes for bacterial versus fungal communities in the chicken gut?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "24_chicken_microbiome.pdf"
      ],
      "expected_answer": "The study utilized beta nearest taxon index (\u03b2NTI) and Raup-Crick (RCBray) metrics to distinguish assembly processes. It determined that bacterial communities are primarily driven by deterministic processes (accounted for 56.7% of assembly), whereas fungal communities are governed more by stochastic processes (accounted for 55.1% of assembly).",
      "expected_concepts": [
        "beta nearest taxon index",
        "\u03b2NTI",
        "deterministic processes",
        "stochastic processes",
        "RCBray"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "25_dairy_cow_feed_FR1",
      "question": "Which specific genomic regions were identified as co-localizing the rumen microbiome and feed efficiency traits, and where was the cluster on BTA11 located?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "25_dairy_cow_feed.pdf"
      ],
      "expected_answer": "Significant genomic regions co-localizing the rumen microbiome and feed efficiency traits were identified on BTA3, BTA7, and BTA11. Specifically, on BTA11, two significant co-localized SNPs between dry matter intake (DMI) and the rumen microbiome clustered in the region of 6.1\u20136.2 Mb.",
      "expected_concepts": [
        "BTA3",
        "BTA7",
        "BTA11",
        "6.1\u20136.2 Mb",
        "co-localization"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "25_dairy_cow_feed_CO2",
      "question": "How does the host genome at Chr3: 116.5 Mb influence residual feed intake (RFI), and what role do microbes play in this relationship?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "25_dairy_cow_feed.pdf"
      ],
      "expected_answer": "The cow genome at Chr3: 116.5 Mb does not affect residual feed intake (RFI) directly; instead, it affects RFI indirectly through mediation. This relationship is mediated by the abundance of specific microbes, including Syntrophococcus, Prevotella, and an unknown genus belonging to the Class Bacilli.",
      "expected_concepts": [
        "indirect effect",
        "mediation",
        "Syntrophococcus",
        "Prevotella",
        "Class Bacilli",
        "RFI"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "25_dairy_cow_feed_TE3",
      "question": "What specific algorithm was used for causal structural learning in the study, and how was the stability of the resulting causal networks evaluated?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "25_dairy_cow_feed.pdf"
      ],
      "expected_answer": "The researchers used the Incremental Association Markov Blanket (IAMB) algorithm for causal structural learning, which utilized Fisher\u2019s Z test for conditional independence. The stability of the resulting causal networks was evaluated using Jackknife resampling.",
      "expected_concepts": [
        "Incremental Association Markov Blanket",
        "IAMB",
        "Jackknife resampling",
        "Fisher\u2019s Z test",
        "causal structural learning"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "26_gut_meat_quality_FR1",
      "question": "Which specific bacterial genera are positively correlated with higher intramuscular fat (IMF) content in pigs according to the gut microbiome studies cited?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "26_gut_meat_quality.pdf"
      ],
      "expected_answer": "According to the paper, the specific bacterial genera positively correlated with higher intramuscular fat (IMF) content in pigs are Ruminococcaceae_NK4A214_group, Parabacteroides, and Christensenellaaceae_R-7_group. These genera were found to have higher relative abundances in correlation with increased IMF.",
      "expected_concepts": [
        "Ruminococcaceae_NK4A214_group",
        "Parabacteroides",
        "Christensenellaaceae_R-7_group",
        "intramuscular fat",
        "pigs"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "26_gut_meat_quality_CO2",
      "question": "How do short-chain fatty acids (SCFAs) bridge the gap between dietary fiber intake and the regulation of host fat accumulation and metabolism?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "26_gut_meat_quality.pdf"
      ],
      "expected_answer": "SCFAs are produced through the microbial fermentation of dietary fiber. They regulate host health and meat quality by acting as signaling molecules; specifically, acetate-dependent stimulation of GPR43 in white adipose tissue improves glucose and lipid metabolism, while other SCFA-driven pathways involve the modulation of TLR4, mTOR, and PPAR signaling to influence fat accumulation.",
      "expected_concepts": [
        "microbial fermentation",
        "GPR43",
        "mTOR",
        "PPAR",
        "lipid metabolism",
        "signaling pathways"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "26_gut_meat_quality_TE3",
      "question": "What metric is used to quantify the impact of the gut microbiome on rabbit finishing weight, and which specific host genes are highlighted as part of the microbial-influenced pathway for intramuscular fat accumulation?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "26_gut_meat_quality.pdf"
      ],
      "expected_answer": "The gut microbiome explains nearly 11% of the variation in the finishing weight of rabbits. In terms of molecular pathways, the host genes Fabp9 and Scd1 are specifically identified as being related to intramuscular fat accumulation in connection with gut microorganisms like Prevotella and Alistipes.",
      "expected_concepts": [
        "11% variation",
        "finishing weight",
        "Fabp9",
        "Scd1",
        "intramuscular fat"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "27_inap_pipeline_FR1",
      "question": "What are the names of the two specialized pipelines integrated into iNAP, and what specific types of microbial associations do they represent?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "27_inap_pipeline.pdf"
      ],
      "expected_answer": "iNAP integrates the molecular ecological network analysis pipeline (MENAP) and the interdomain ecological network analysis pipeline (IDENAP). These pipelines correspond to the intradomain and interdomain associations of microbial species, respectively.",
      "expected_concepts": [
        "MENAP",
        "IDENAP",
        "intradomain",
        "interdomain",
        "microbial species"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "27_inap_pipeline_CO2",
      "question": "How does the iNAP system facilitate the visualization and downstream interpretation of the generated ecological networks?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "27_inap_pipeline.pdf"
      ],
      "expected_answer": "iNAP provides automated visualization outputs that are directly compatible with major public-friendly software programs, specifically Cytoscape and Gephi. This allows users to interpret the data by visualizing networks through different lenses, such as coloring by taxonomic groups or module compartments to reveal the underlying biological structure.",
      "expected_concepts": [
        "Cytoscape",
        "Gephi",
        "taxonomic group",
        "module compartments",
        "network visualization"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "27_inap_pipeline_TE3",
      "question": "Which specific algorithms and methods does iNAP recommend for constructing networks from non-temporal versus temporal microbial datasets?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "27_inap_pipeline.pdf"
      ],
      "expected_answer": "For non-temporal datasets, iNAP recommends SparCC, SPIEC-EASI, and RMT-based Pearson's and Spearman's correlations. For temporal datasets, the system suggests using eLSA/LA along with RMT-based Pearson's and Spearman's approaches to identify associations.",
      "expected_concepts": [
        "SparCC",
        "SPIEC-EASI",
        "eLSA",
        "Random Matrix Theory (RMT)",
        "non-temporal",
        "temporal"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "28_otu_vs_asv_FR1",
      "question": "How many total taxa were identified in the full dataset by the OTU-based pipeline compared to the ASV-based pipeline prior to filtering?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "28_otu_vs_asv.pdf"
      ],
      "expected_answer": "According to the study's results and Table 1, the OTU-based pipeline identified 67,015 taxa in the full dataset. In contrast, the ASV-based pipeline reported significantly fewer unique sequences, identifying only 8,005 taxa.",
      "expected_concepts": [
        "67,015",
        "8,005",
        "OTU-based pipeline",
        "ASV-based pipeline",
        "Table 1"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "28_otu_vs_asv_CO2",
      "question": "How does the choice of bioinformatic pipeline influence the interpretation of microbial community diversity and composition across different sample types?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "28_otu_vs_asv.pdf"
      ],
      "expected_answer": "The choice of pipeline results in deviations in community composition ranging from 6.75% to 10.81%, with the OTU-based approach generally reporting higher alpha-diversity and significantly more unique taxa. These inconsistencies are not uniform across all samples; they are more pronounced in biological samples with high variability, such as primary and waste-activated sewage sludge (PWASS), compared to more stable reactor treatments.",
      "expected_concepts": [
        "alpha-diversity",
        "community composition",
        "pipeline-based deviation",
        "PWASS",
        "variability"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "28_otu_vs_asv_TE3",
      "question": "What specific software tools and clustering parameters were utilized to define the OTU-based and ASV-based pipelines in this research?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "28_otu_vs_asv.pdf"
      ],
      "expected_answer": "The OTU-based pipeline utilized VSEARCH for sequence clustering using a 97% similarity threshold. The ASV-based pipeline was implemented using the DADA2 package, which employs specific sequencing error correction algorithms to identify exact amplicon sequence variants rather than clustering by a fixed percentage.",
      "expected_concepts": [
        "VSEARCH",
        "97% similarity",
        "DADA2",
        "error correction algorithms",
        "clustering"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "29_16s_best_practices_FR1",
      "question": "What is the recommended target for raw read numbers per sample when starting a 16S rRNA gene sequencing study without prior pilot data?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "29_16s_best_practices.pdf"
      ],
      "expected_answer": "In the absence of pilot data for a specific matrix, the paper recommends a raw read number target of 10,000 reads per sample. This serves as a good starting point for researchers to ensure sufficient sequencing depth.",
      "expected_concepts": [
        "10,000 reads",
        "raw read number",
        "pilot data",
        "16S rRNA gene sequencing"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "29_16s_best_practices_CO2",
      "question": "Why are Amplicon Sequence Variants (ASVs) considered superior to Operational Taxonomic Units (OTUs) for modern microbiome comparative analyses?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "29_16s_best_practices.pdf"
      ],
      "expected_answer": "ASVs are preferred because they can distinguish sequencing errors from true biological variation, allowing for finer biological resolution. Furthermore, unlike OTUs, ASVs can be independently generated and compared across different studies, making them more suitable for large-scale or meta-analysis research.",
      "expected_concepts": [
        "sequencing errors",
        "biological variation",
        "cross-study comparison",
        "taxonomic resolution"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "29_16s_best_practices_TE3",
      "question": "How does the technical approach of rarefaction differ from scaling normalization methods like TMM or DESeq2 in handling sample variance?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "29_16s_best_practices.pdf"
      ],
      "expected_answer": "Rarefaction equalizes sample variance by subsampling data to a common depth, which has the drawback of discarding potentially valuable data and introducing artificial uncertainty. In contrast, scaling normalization methods like TMM or DESeq2's median of ratios use mathematical transformations to normalize data without the need to discard sequences.",
      "expected_concepts": [
        "rarefaction",
        "subsampling",
        "TMM",
        "DESeq2",
        "scaling normalization",
        "artificial uncertainty"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "30_aquaculture_metagenomics_FR1",
      "question": "What specific percentages of prokaryotes, eukaryotes, archaea, and viruses were identified using the shotgun metagenomics data?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "30_aquaculture_metagenomics.pdf"
      ],
      "expected_answer": "According to the shotgun metagenomics data, the community profile consisted of 75.55% prokaryotes, 23.97% eukaryotes, 0.24% archaea, and 0.24% viruses.",
      "expected_concepts": [
        "shotgun metagenomics",
        "prokaryotes",
        "eukaryotes",
        "archaea",
        "viruses"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "30_aquaculture_metagenomics_CO2",
      "question": "How do the three sequencing approaches (16S rRNA, PacBio long-read, and shotgun metagenomics) differ in their contribution to the study's characterization of the microbial environment?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "30_aquaculture_metagenomics.pdf"
      ],
      "expected_answer": "16S rRNA sequencing is used to track spatio-temporal patterns and alpha richness across samples. PacBio long-read amplicons provide higher taxonomic resolution to identify specific species and pathogens, while shotgun metagenomics provides a cross-domain view that includes non-bacterial organisms such as fungi and viruses.",
      "expected_concepts": [
        "spatio-temporal patterns",
        "taxonomic resolution",
        "pathogen detection",
        "non-bacterial domains",
        "community composition"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "30_aquaculture_metagenomics_TE3",
      "question": "Based on the comparative analysis of the Earth (V4) and MiSeq (V3-V4) primers, what were the quantitative differences in alpha richness and how did this impact the interpretation of community structure?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "30_aquaculture_metagenomics.pdf"
      ],
      "expected_answer": "The Earth primers (V4) yielded a higher alpha richness (1070\u20132240 ASVs) compared to the MiSeq primers (V3-4: 441\u20131962 ASVs). However, the choice of primer did not impact the observed community structure, as both primers resulted in identical patterns when analyzed via Multidimensional scaling (MDS) plots.",
      "expected_concepts": [
        "alpha richness",
        "ASVs",
        "V4 region",
        "V3-V4 region",
        "MDS plots",
        "community structure"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "31_amr_one_health_FR1",
      "question": "According to the World Health Organization's 2019 data, how many of the 32 antimicrobials in hospital development were classified as innovative?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "31_amr_one_health.pdf"
      ],
      "expected_answer": "In 2019, the World Health Organization (WHO) identified 32 antimicrobials in hospital development, but only six of these were classified as innovative. This finding highlights a critical lack of new and innovative treatments in the development pipeline.",
      "expected_concepts": [
        "WHO",
        "32 antimicrobials",
        "six innovative",
        "hospital development",
        "lack of new treatments"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "31_amr_one_health_CO2",
      "question": "How does the metabolism and excretion of antimicrobials in animals contribute to environmental concerns within the One Health framework?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "31_amr_one_health.pdf"
      ],
      "expected_answer": "Within the One Health perspective, animal health and environmental health are linked by the fact that animals excrete a significant percentage (75%\u201390%) of antimicrobials without them being metabolized. Because the volume of antimicrobials used in animals is greater than in humans, this leads to the widespread dispersion of active substances into the environment, potentially driving resistance.",
      "expected_concepts": [
        "One Health",
        "75%\u201390%",
        "unmetabolized",
        "environmental dispersion",
        "animal excretion"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "31_amr_one_health_TE3",
      "question": "Compare the recorded usage in tons of fluoroquinolones and third- and fourth-generation cephalosporins in animals between Europe (2017) and the USA (2018).",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "31_amr_one_health.pdf"
      ],
      "expected_answer": "In Europe (2017), animal usage was recorded at 18 tons of third- and fourth-generation cephalosporins and 216 tons of fluoroquinolones. In contrast, the USA (2018) recorded higher cephalosporin usage at 31.44 tons but significantly lower fluoroquinolone usage at 23.3 tons.",
      "expected_concepts": [
        "18 tons",
        "216 tons",
        "31.44 tons",
        "23.3 tons",
        "fluoroquinolones",
        "cephalosporins"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "32_amr_livestock_environment_FR1",
      "question": "What is the projected annual human mortality rate associated with antimicrobial resistance by the year 2050 if current trends continue?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "32_amr_livestock_environment.pdf"
      ],
      "expected_answer": "According to the paper, there is an anticipation of 10 million annual AR-associated deaths by 2050 if current trends continue.",
      "expected_concepts": [
        "10 million",
        "annual deaths",
        "2050",
        "current trends"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "32_amr_livestock_environment_CO2",
      "question": "How do the interconnected domains of the One Health framework contribute to the cycle of antimicrobial resistance dissemination?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "32_amr_livestock_environment.pdf"
      ],
      "expected_answer": "The One Health perspective describes a flow where livestock act as significant reservoirs for zoonotic pathogens (like E. coli and Salmonella) carrying resistance genes. These resistance genes and unchanged antibiotics (excreted at rates of 30-90%) are released into soil and water ecosystems, which serve as a 'global resistome' and repository, ultimately allowing resistance to cycle back to humans through environmental and animal contact.",
      "expected_concepts": [
        "One Health",
        "livestock reservoirs",
        "global resistome",
        "antibiotic excretion",
        "AR dissemination"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "32_amr_livestock_environment_TE3",
      "question": "Based on statistical modeling estimates, what was the estimated global antimicrobial usage in livestock for 2020 and what is the projected usage for 2030?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "32_amr_livestock_environment.pdf"
      ],
      "expected_answer": "Global antimicrobial usage in livestock was estimated at 99,502 tons in 2020. It is projected to increase by 8% to reach a total of 107,472 tons by 2030.",
      "expected_concepts": [
        "99,502 tons",
        "107,472 tons",
        "8% increase",
        "2030 projection"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "33_amr_food_animals_FR1",
      "question": "What percentage of antibiotics administered to food animals is estimated to be excreted without being absorbed, and through what primary mediums does this occur?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "33_amr_food_animals.pdf"
      ],
      "expected_answer": "According to the paper, approximately 75% of antibiotics administered to food animals are not absorbed by their bodies. These substances are subsequently excreted into the environment through the animal's feces and urine.",
      "expected_concepts": [
        "75%",
        "excreted",
        "feces",
        "urine",
        "not absorbed"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "33_amr_food_animals_CO2",
      "question": "How do antibiotic-resistant bacteria (ARB) and genes (ARGs) originate in food animal farms and eventually reach the human population through different environmental and occupational pathways?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "33_amr_food_animals.pdf"
      ],
      "expected_answer": "ARB and ARGs spread from farms to humans via a multi-faceted process involving direct contact, environmental dispersion, and the food chain. Occupational risks are high for slaughterhouse workers (e.g., 50% MDR E. coli prevalence), while environmental transmission occurs through bioaerosols that can travel up to 10 km and the contamination of water and soil that eventually enters the human food supply or healthcare systems.",
      "expected_concepts": [
        "bioaerosols",
        "occupational risk",
        "food chain",
        "environmental contamination",
        "transmission pathways",
        "direct contact"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "33_amr_food_animals_TE3",
      "question": "Based on the findings regarding novel mitigation technologies, what is the specific mechanism and efficiency of using graphene oxide (GO) nanosheets for water treatment?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "33_amr_food_animals.pdf"
      ],
      "expected_answer": "Graphene oxide (GO) nanosheets are used to remove antibiotic resistance genes (ARGs) from water with an efficiency of approximately 80%. This removal is achieved through a specific chemical mechanism known as \u03c0-stacking (pi-stacking) interactions between the ARGs and the nanosheets.",
      "expected_concepts": [
        "Graphene oxide",
        "80% removal",
        "\u03c0-stacking",
        "nanosheets",
        "ARGs"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "34_one_health_framework_FR1",
      "question": "What are the estimated annual public health and economic benefits of the PulseNet laboratory network according to the paper?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "34_one_health_framework.pdf"
      ],
      "expected_answer": "The PulseNet laboratory network is estimated to avert 270,000 foodborne illnesses and save US $507 million annually. These results are achieved through the effective sharing of laboratory data.",
      "expected_concepts": [
        "PulseNet",
        "270,000 foodborne illnesses",
        "US $507 million",
        "data sharing"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "34_one_health_framework_CO2",
      "question": "How does the Generalized One Health Framework (GOHF) aim to improve zoonotic disease programming?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "34_one_health_framework.pdf"
      ],
      "expected_answer": "The GOHF improves zoonotic disease programming by providing a five-step stepwise framework coupled with a resource toolkit that synthesizes successful existing processes and idealized models. It distinguishes between overarching One Health systems and specific disease programs, utilizing cross-cutting technical domains to achieve defined objectives and outcomes at each stage.",
      "expected_concepts": [
        "Generalized One Health Framework",
        "five-step stepwise framework",
        "resource toolkit",
        "zoonotic disease programming",
        "technical domains"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "34_one_health_framework_TE3",
      "question": "According to the paper, what is the specific threshold for dog vaccination coverage required for rabies control, and how does smartphone-based H5N1 detection compare to traditional methods?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "34_one_health_framework.pdf"
      ],
      "expected_answer": "For rabies control, a critical threshold of 70% or higher vaccination coverage in dog populations is required to reduce human cases. Regarding H5N1 avian influenza, smartphone-based diagnostic systems achieve a two-fold higher detectability than traditional fluorescent strip readers in clinical samples.",
      "expected_concepts": [
        "70% vaccination coverage",
        "rabies",
        "H5N1 avian influenza",
        "two-fold higher detectability",
        "fluorescent strip readers"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "35_isse_framework_amr_FR1",
      "question": "What are the five distinct levels of evaluation defined in the Integrated Surveillance System Evaluation (ISSE) framework?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "35_isse_framework_amr.pdf"
      ],
      "expected_answer": "The ISSE framework consists of five levels: level 1 is integration, level 2 is the production of information, level 3 is the generation of knowledge, level 4 is the influence on decisions, and level 5 is the impact on outcomes. These levels were developed using a participatory process to assess the entire value chain of a surveillance system.",
      "expected_concepts": [
        "integration",
        "production of information",
        "generation of knowledge",
        "influence on decisions",
        "impact on outcomes"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "35_isse_framework_amr_CO2",
      "question": "Why did the authors conclude that existing evaluation tools like ATLASS and NEOH were insufficient for assessing the full value of integrated surveillance systems?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "35_isse_framework_amr.pdf"
      ],
      "expected_answer": "Based on a mapping exercise of 12 existing tools, the authors found that none of the tools covered all five levels of the ISSE framework. While tools like ATLASS cover multiple levels, most focus heavily on the early stages like integration (Level 1) or information production (Level 2), leaving a gap in evaluating how surveillance actually influences decisions and impacts public health outcomes.",
      "expected_concepts": [
        "review of 12 existing tools",
        "none specifically developed for OH integrated surveillance",
        "coverage gaps in evaluation levels",
        "decision-making",
        "impact on outcomes"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "35_isse_framework_amr_TE3",
      "question": "How does the ISSE framework technically measure the level of One Health integration across different surveillance activities?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "35_isse_framework_amr.pdf"
      ],
      "expected_answer": "The framework utilizes a semi-quantitative measurement scale ranging from 0 to 5 to assess the degree of integration. This measurement is specifically applied across four core surveillance activities: data collection, analysis, interpretation, and dissemination.",
      "expected_concepts": [
        "semi-quantitative scale",
        "0-5 scale",
        "collection",
        "analysis",
        "interpretation",
        "dissemination"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "36_integrated_amr_surveillance_FR1",
      "question": "How many integrated surveillance systems for ABR were identified in the review, and what is their geographic distribution?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "36_integrated_amr_surveillance.pdf"
      ],
      "expected_answer": "The scoping review identified 14 distinct integrated surveillance systems for ABR. All of these systems operate exclusively in high-income countries, with 11 located in Europe, 2 in North America, and 1 in Asia.",
      "expected_concepts": [
        "14 systems",
        "high-income countries",
        "Europe",
        "North America",
        "Asia"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "36_integrated_amr_surveillance_CO2",
      "question": "Based on the study's findings, what is the current state of knowledge regarding the outcomes and impacts of integrating ABR surveillance systems?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "36_integrated_amr_surveillance.pdf"
      ],
      "expected_answer": "The study concludes that evidence regarding the actual outcomes and impacts of integration within these surveillance systems is sparse and poorly described in existing literature. While the systems can be categorized by their structural and informational integration, the specific benefits or results stemming from this integration remain under-evidenced.",
      "expected_concepts": [
        "sparse evidence",
        "outcomes",
        "impacts",
        "poorly described",
        "integration"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "36_integrated_amr_surveillance_TE3",
      "question": "Describe the statistical methodology and the number of variables used to generate the typology of the identified surveillance systems.",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "36_integrated_amr_surveillance.pdf"
      ],
      "expected_answer": "The researchers conducted a descriptive analysis of 39 organizational and functional variables across domains like scope and collaboration. A subset of 16 variables was then used in Multiple-correspondence analysis (MCA) and Hierarchical cluster analysis (HCA) to group the 14 systems into four distinct typology clusters based on their levels of informational and structural integration.",
      "expected_concepts": [
        "39 variables",
        "16 variables for MCA/HCA",
        "Multiple-correspondence analysis (MCA)",
        "Hierarchical cluster analysis (HCA)",
        "four typology clusters",
        "informational and structural integration"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "37_data_driven_one_health_FR1",
      "question": "According to the World Bank, what specific percentage of operational savings can be achieved by co-locating human and animal health laboratories?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "37_data_driven_one_health.pdf"
      ],
      "expected_answer": "Hosting human and animal health laboratories under one roof can provide 26% savings in operational costs, according to the World Bank estimate cited in Table 1 of the paper.",
      "expected_concepts": [
        "26%",
        "operational costs",
        "laboratories",
        "World Bank",
        "one roof"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "37_data_driven_one_health_CO2",
      "question": "How does the One Health multisectoral framework theoretically reduce disease incidence and economic damage during a zoonotic spillover event?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "37_data_driven_one_health.pdf"
      ],
      "expected_answer": "The One Health framework utilizes integrated surveillance and early detection to identify infections as they amplify from wild animals through livestock. By intervening early, the system reduces disease incidence in both humans and animals, thereby preventing the severe economic damage seen in cases like Sierra Leone's GDP drop during the Ebola epidemic.",
      "expected_concepts": [
        "early detection",
        "integrated surveillance",
        "amplification",
        "economic damage",
        "disease incidence"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "37_data_driven_one_health_TE3",
      "question": "Regarding the machine learning modeling referenced from Mollentze et al., what features were used for zoonotic risk prediction and what was the reported accuracy for high-potential viruses?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "37_data_driven_one_health.pdf"
      ],
      "expected_answer": "The machine learning models utilized viral and human genome sequence features to predict risk. These models successfully identified 70.8% of human viruses that possessed high or very high zoonotic potential.",
      "expected_concepts": [
        "viral and human genome sequence features",
        "70.8%",
        "zoonotic potential",
        "Mollentze et al.",
        "machine learning"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "38_one_health_africa_FR1",
      "question": "What is the reported case fatality rate for bat-originated viral zoonotic diseases in Africa according to the paper?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "38_one_health_africa.pdf"
      ],
      "expected_answer": "The paper reports a 61% case fatality rate for bat-originated viral zoonotic diseases in Africa. This high rate contributes to the disproportionately fatal burden of zoonotic diseases on the continent.",
      "expected_concepts": [
        "61%",
        "case fatality rate",
        "bat-originated",
        "viral zoonotic diseases",
        "Africa"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "38_one_health_africa_CO2",
      "question": "How does the paper justify the 'business case' for the One Health approach in the context of zoonotic disease prevention?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "38_one_health_africa.pdf"
      ],
      "expected_answer": "The economic justification, or 'business case,' for the One Health approach is based on the high cost-effectiveness of primary prevention. The paper notes that prevention actions cost less than 1/20th of the value of lives lost each year to these diseases, making a strong argument for institutionalizing these strategies to mitigate the high burden of illness and death.",
      "expected_concepts": [
        "business case",
        "primary prevention",
        "cost-effective",
        "1/20th",
        "value of lives lost",
        "One Health approach"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "38_one_health_africa_TE3",
      "question": "According to the disease burden metrics provided, what percentages of existing and emerging infectious diseases in humans are zoonotic, and what is the resulting impact in low- and middle-income countries?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "38_one_health_africa.pdf"
      ],
      "expected_answer": "The paper cites that 60% of existing and 75% of newly emerging infectious diseases in humans are zoonotic. These diseases result in an estimated 2.5 billion cases of human illness and 2.7 million deaths annually within low- and middle-income countries.",
      "expected_concepts": [
        "60%",
        "75%",
        "emerging infectious diseases",
        "2.5 billion cases",
        "2.7 million deaths",
        "low- and middle-income countries"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "39_zoonotic_southeast_asia_FR1",
      "question": "According to the study's findings, what was the reported seroprevalence of Japanese encephalitis in pigs in Vietnam in 2009?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "39_zoonotic_southeast_asia.pdf"
      ],
      "expected_answer": "In 2009, the reported seroprevalence (percentage of positive cases) for Japanese encephalitis in pigs in Vietnam reached 100%.",
      "expected_concepts": [
        "Japanese encephalitis",
        "Vietnam",
        "100%",
        "2009",
        "seroprevalence"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "39_zoonotic_southeast_asia_CO2",
      "question": "How does the research focus on different pathogen types (viral, bacterial, and parasitic) compare across the reviewed literature, and which countries were identified as high priority for pig and poultry zoonoses?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "39_zoonotic_southeast_asia.pdf"
      ],
      "expected_answer": "The literature is predominantly focused on parasitic pathogens, which account for 52% of studies, followed by bacterial pathogens at 34% and viral pathogens at 14%. Within this regional context, Cambodia and Vietnam are identified as high-priority areas due to having the highest number of reported studies regarding zoonotic diseases originating from pigs and poultry.",
      "expected_concepts": [
        "parasitic pathogens (52%)",
        "bacterial pathogens (34%)",
        "viral pathogens (14%)",
        "Cambodia",
        "Vietnam",
        "high priority"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "39_zoonotic_southeast_asia_TE3",
      "question": "Describe the systematic methodology used to select the final 95 publications from the initial search results, including the databases and criteria involved.",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "39_zoonotic_southeast_asia.pdf"
      ],
      "expected_answer": "The researchers conducted a systematic literature review by searching PubMed, Web of Science, and Science Direct, which initially retrieved 2,329 articles. These were subjected to a two-stage screening process using predefined inclusion and exclusion criteria (such as peer-reviewed original research and specific timeframes), ultimately resulting in 95 publications selected for qualitative synthesis. Data from these articles were then extracted into a standardized template and analyzed using a One Health approach framework.",
      "expected_concepts": [
        "2,329 articles",
        "95 publications",
        "inclusion/exclusion criteria",
        "PubMed/Web of Science/Science Direct",
        "One Health approach",
        "qualitative synthesis"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "40_animal_agriculture_one_health_FR1",
      "question": "What percentage of all human pathogens and emerging diseases affecting humans are estimated to be zoonotic in origin?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "40_animal_agriculture_one_health.pdf"
      ],
      "expected_answer": "According to the paper, approximately 60% of all human pathogens and 75% of emerging diseases affecting humans are zoonotic. These diseases are estimated to cause about one billion cases worldwide each year.",
      "expected_concepts": [
        "60% of human pathogens",
        "75% of emerging diseases",
        "zoonotic",
        "one billion cases"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "40_animal_agriculture_one_health_CO2",
      "question": "How does animal agriculture facilitate the development and transmission of antimicrobial resistance (AMR) from farms to the human population?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "40_animal_agriculture_one_health.pdf"
      ],
      "expected_answer": "Animal agriculture facilitates AMR through the high-volume use of antimicrobials for non-therapeutic purposes like growth promotion, which creates selective pressure for resistant bacteria. These resistant strains develop through biological mechanisms like horizontal gene transfer (HGT) and vertical transmission, eventually spreading to humans via direct contact, contaminated food products, and environmental routes.",
      "expected_concepts": [
        "growth promotion",
        "horizontal gene transfer (HGT)",
        "environmental contamination",
        "non-therapeutic use",
        "selective pressure"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "40_animal_agriculture_one_health_TE3",
      "question": "Based on the paper's findings, what are the specific projected global health and economic impacts of antimicrobial resistance (AMR) by the year 2050?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "40_animal_agriculture_one_health.pdf"
      ],
      "expected_answer": "By the year 2050, antimicrobial resistance is predicted to result in 10 million human deaths annually. Furthermore, the economic burden is expected to be significant, with global healthcare costs exceeding $1 trillion USD per year.",
      "expected_concepts": [
        "10 million deaths",
        "$1 trillion USD",
        "2050",
        "healthcare costs",
        "annual mortality"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "41_salmonella_wgs_amr_FR1",
      "question": "Which specific beta-lactam resistance gene was most commonly identified in Salmonella isolates from imported chicken, and what was its prevalence?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "41_salmonella_wgs_amr.pdf"
      ],
      "expected_answer": "The most common determinant for beta-lactam resistance was blaCMY-2, which was found in 54% of the chicken isolates.",
      "expected_concepts": [
        "blaCMY-2",
        "54%",
        "beta-lactam resistance",
        "chicken isolates"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "41_salmonella_wgs_amr_CO2",
      "question": "Based on the findings, how do the antimicrobial resistance (AMR) profiles and serovar distributions differ between Salmonella found in imported raw chicken versus imported edible leaves?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "41_salmonella_wgs_amr.pdf"
      ],
      "expected_answer": "Salmonella from imported raw chicken shows a much higher prevalence of resistance (93%) and multidrug resistance (71%) compared to isolates from edible leaves (3% resistance and 2% MDR). Furthermore, chicken isolates are dominated by specific highly resistant serovars like S. Heidelberg from Brazil, while leaves show a different variety of serovars distributed across various countries of origin as seen in Figure 1.",
      "expected_concepts": [
        "multidrug resistance (MDR)",
        "S. Heidelberg",
        "prevalence",
        "raw chicken",
        "edible leaves"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "41_salmonella_wgs_amr_TE3",
      "question": "What bioinformatics pipeline and specific quantitative threshold did the researchers use to establish genetic relatedness between Salmonella isolates?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "41_salmonella_wgs_amr.pdf"
      ],
      "expected_answer": "The researchers used Whole Genome Sequencing (WGS) on the Illumina HiSeq 2500 platform and analyzed the data using KmerID for identification, MOST for MLST typing, and SNP typing for genomic relationships. They defined isolates as being genetically related if they exhibited a pairwise SNP difference of \u22645.",
      "expected_concepts": [
        "SNP typing",
        "\u22645 SNPs",
        "KmerID",
        "MOST",
        "Illumina HiSeq 2500",
        "pairwise SNP difference"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "42_campylobacter_wgs_thailand_FR1",
      "question": "Which specific sequence type was identified as the most prominent in the study, and what was the assigned number for the novel sequence type discovered?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "42_campylobacter_wgs_thailand.pdf"
      ],
      "expected_answer": "The most prominent sequence type identified was ST 2274, which accounted for 19.2% of the isolates (5 out of 26). The study also discovered a novel sequence type assigned the number ST 13540 by PubMLST.",
      "expected_concepts": [
        "ST 2274",
        "ST 13540",
        "PubMLST",
        "prominent sequence type"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "42_campylobacter_wgs_thailand_CO2",
      "question": "Based on the findings, how do the genotypic markers for resistance in C. jejuni relate to its observed phenotypic resistance against fluoroquinolones?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "42_campylobacter_wgs_thailand.pdf"
      ],
      "expected_answer": "There is a high correlation between genotypic markers and phenotypic resistance in C. jejuni; 95.2% of the isolates possessed gyrA point mutations, which aligns with the finding that 100% of the C. jejuni isolates demonstrated phenotypic resistance to the fluoroquinolones ciprofloxacin and enrofloxacin, as well as nalidixic acid.",
      "expected_concepts": [
        "gyrA mutations",
        "fluoroquinolones",
        "ciprofloxacin",
        "phenotypic resistance",
        "genotype-phenotype correlation"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "42_campylobacter_wgs_thailand_TE3",
      "question": "Describe the bioinformatic tools and methods used for the de novo assembly and the subsequent identification of antimicrobial resistance genes and virulence factors.",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "42_campylobacter_wgs_thailand.pdf"
      ],
      "expected_answer": "The study performed de novo assembly of Illumina MiSeq reads using SPAdes. For bioinformatic analysis, ResFinder was utilized to identify antimicrobial resistance genes (ARGs), while the Virulence Factor Database (VFDB) was used to detect virulence factors and Kmerfinder was used for species identification.",
      "expected_concepts": [
        "SPAdes",
        "ResFinder",
        "VFDB",
        "Kmerfinder",
        "Illumina MiSeq"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "43_esbl_ecoli_food_FR1",
      "question": "What are the reported annual figures for infections and deaths attributed to ESBL-producing E. coli in the European public health sector?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "43_esbl_ecoli_food.pdf"
      ],
      "expected_answer": "In Europe, ESBL-producing E. coli has a significant impact on public health, causing approximately 300,000 infections annually. These infections are associated with an estimated 9,000 deaths.",
      "expected_concepts": [
        "300,000 infections",
        "9,000 deaths",
        "Europe",
        "ESBL-producing E. coli"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "43_esbl_ecoli_food_CO2",
      "question": "How does the paper support the claim that ESBL-producing E. coli transmission constitutes a cycle between animals, the environment, and humans?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "43_esbl_ecoli_food.pdf"
      ],
      "expected_answer": "The paper illustrates complex transmission pathways through an epidemiological diagram involving farms, food industries, and environmental vectors like water and soil. This conceptual framework is supported by genetic analysis showing nearly identical plasmid backbones across various lineages in chickens, cattle, swine, and humans, indicating that the resistance markers are shared across these diverse sectors.",
      "expected_concepts": [
        "transmission pathways",
        "plasmid backbones",
        "environmental contamination",
        "animal-to-human",
        "genetic similarity"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "43_esbl_ecoli_food_TE3",
      "question": "According to the modeling studies reviewed, what specific reduction in ESBL-producing E. coli prevalence within poultry parent stock is necessary to impact human prevalence?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "43_esbl_ecoli_food.pdf"
      ],
      "expected_answer": "Modeling studies indicate that the prevalence of ESBL-producing E. coli in poultry parent stock and their associated broiler environments must be reduced to a threshold of less than 1%. Reaching this specific target is projected to drastically reduce the subsequent prevalence of these bacteria in the human population.",
      "expected_concepts": [
        "modeling studies",
        "parent stock",
        "less than 1%",
        "prevalence reduction",
        "broiler environments"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "44_eu_amr_report_2022_FR1",
      "question": "According to the 2022 data for human Salmonella cases, what was the specific resistance percentage observed for S. Kentucky against ciprofloxacin?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "44_eu_amr_report_2022.pdf"
      ],
      "expected_answer": "An extremely high proportion of ciprofloxacin resistance (72.7%) was noted in S. Kentucky isolates from human cases in 2022.",
      "expected_concepts": [
        "72.7%",
        "S. Kentucky",
        "ciprofloxacin resistance"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "44_eu_amr_report_2022_CO2",
      "question": "Based on the reported resistance levels in 2021-2022, what is the clinical implication regarding the use of fluoroquinolones for treating human Campylobacter infections?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "44_eu_amr_report_2022.pdf"
      ],
      "expected_answer": "Fluoroquinolones can no longer be recommended for the treatment of Campylobacter infections in humans because C. jejuni and C. coli from both human and animal origins showed high to extremely high levels of resistance to these antimicrobials.",
      "expected_concepts": [
        "no longer recommended for treatment",
        "high to extremely high resistance",
        "human and animal origins"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "44_eu_amr_report_2022_TE3",
      "question": "From 2021 onwards, what method was authorized as an alternative to the supplementary (panel 2) phenotypic testing for Salmonella and indicator E. coli isolates showing resistance to extended-spectrum cephalosporins and/or carbapenems?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "44_eu_amr_report_2022.pdf"
      ],
      "expected_answer": "Whole genome sequencing (WGS) was authorized as an alternative method to supplementary phenotypic testing for these resistant isolates.",
      "expected_concepts": [
        "whole genome sequencing",
        "WGS",
        "phenotypic testing alternative",
        "extended-spectrum cephalosporins",
        "carbapenems"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "45_kenya_amr_digital_FR1",
      "question": "How many surveillance sites in Kenya submitted data to the WHO Global Antimicrobial Resistance and Use Surveillance System (GLASS) during the 2022 data call?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "45_kenya_amr_digital.pdf"
      ],
      "expected_answer": "In the 2022 data call, Kenya submitted data from 16 surveillance sites. This represents a significant increase from the 6 sites that submitted data in 2021.",
      "expected_concepts": [
        "16 surveillance sites",
        "WHO GLASS",
        "2022 data call",
        "reporting capacity"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "45_kenya_amr_digital_CO2",
      "question": "How does the OHAMRS architecture enable a One Health approach to antimicrobial resistance surveillance?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "45_kenya_amr_digital.pdf"
      ],
      "expected_answer": "The OHAMRS integrates human and animal health data by routing information from various surveillance sites into a centralized national Central Data Warehouse (CDW). Using Open Interop middleware for interoperability and DHIS2 for visualization, the system creates a unified platform where 17 of the 42 dashboards are specifically dedicated to monitoring priority pathogens across both sectors.",
      "expected_concepts": [
        "One Health approach",
        "Central Data Warehouse (CDW)",
        "Open Interop middleware",
        "DHIS2",
        "human and animal health integration"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "45_kenya_amr_digital_TE3",
      "question": "What are the three specific methods used by the 17 human health surveillance sites to submit data to the system, and what is the percentage breakdown for each method?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "45_kenya_amr_digital.pdf"
      ],
      "expected_answer": "The data submission methods for the 17 sites include Laboratory Information Management Systems (LIMS) at 43%, MS-Excel templates at 38%, and WHONET at 19%. This variety in submission methods allows for flexibility depending on the technological capacity of the individual laboratory.",
      "expected_concepts": [
        "LIMS (43%)",
        "MS-Excel template (38%)",
        "WHONET (19%)",
        "data submission methods",
        "human health sector"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "46_methylpipe_FR1",
      "question": "According to the performance metrics, how many human promoters can methylPipe profile in approximately 50 seconds using a single core?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "46_methylpipe.pdf"
      ],
      "expected_answer": "In a performance test using a single core and a peak memory usage of less than 1GB of RAM, methylPipe is able to profile 100 human promoters in a sample in approximately 50 seconds.",
      "expected_concepts": [
        "100 human promoters",
        "50 seconds",
        "single core",
        "1GB RAM"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "46_methylpipe_CO2",
      "question": "Based on the comparative analysis against other software tools, what distinguishes methylPipe and radMeth from packages such as BiSeq or bsseq for WGBS tasks?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "46_methylpipe.pdf"
      ],
      "expected_answer": "The methylPipe and radMeth packages are distinguished by their efficiency and scalability; they were the only tools evaluated that successfully completed all proposed Whole Genome Bisulfite Sequencing (WGBS) analysis tasks using standard computational resources. In contrast, other tools like BiSeq, M3D, and bsseq failed to complete all tasks under the same standard resource constraints.",
      "expected_concepts": [
        "WGBS analysis tasks",
        "standard resources",
        "efficiency",
        "radMeth",
        "scalability"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "46_methylpipe_TE3",
      "question": "What statistical algorithms does the system use for identifying Differentially Methylated Regions (DMRs), and what are the resource requirements for genome-wide identification?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "46_methylpipe.pdf"
      ],
      "expected_answer": "The system utilizes Wilcoxon and Kruskal-Wallis tests for the identification of Differentially Methylated Regions (DMRs). For a full genome-wide analysis between two WGBS samples, this process requires approximately 45 minutes of computation time using 10 cores and reaches a peak memory usage of 28GB RAM.",
      "expected_concepts": [
        "Wilcoxon test",
        "Kruskal-Wallis test",
        "45 minutes",
        "10 cores",
        "28GB RAM"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "47_htsflow_FR1",
      "question": "What are the five critical issues identified by the authors for effective NGS data management?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "47_htsflow.pdf"
      ],
      "expected_answer": "The five critical issues identified for NGS data management are structuring raw data, monitoring analysis flow, automatizing and documenting tools, providing ease of use through a Graphical User Interface (GUI), and ensuring data reproducibility via metadata standards.",
      "expected_concepts": [
        "structuring raw data",
        "monitoring analysis flow",
        "automatizing tools",
        "ease of use",
        "data reproducibility",
        "metadata standards"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "47_htsflow_CO2",
      "question": "How does HTS-flow balance the trade-off between user accessibility and analytical flexibility compared to platforms like Galaxy?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "47_htsflow.pdf"
      ],
      "expected_answer": "HTS-flow prioritizes automation and ease of use for non-experienced users by integrating directly with SMITH LIMS and offering predefined modular scripts. While this approach provides higher automation and streamlines the workflow for biologists, it sacrifices some of the granular parameter flexibility available in more manual platforms like Galaxy.",
      "expected_concepts": [
        "automation",
        "parameter flexibility",
        "non-experienced users",
        "LIMS integration",
        "standardization",
        "Galaxy"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "47_htsflow_TE3",
      "question": "Which specific software libraries and packages does HTS-flow utilize to ensure genomic reproducibility and perform secondary analysis of mRNA kinetics?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "47_htsflow.pdf"
      ],
      "expected_answer": "HTS-flow ensures reproducibility by adopting standard Bioconductor metadata libraries, specifically TxDb and BSgenome. For specialized secondary analysis involving the quantification of mRNA synthesis and degradation, the system integrates the INSPEcT package.",
      "expected_concepts": [
        "TxDb",
        "BSgenome",
        "INSPEcT",
        "mRNA synthesis",
        "Bioconductor",
        "reproducibility"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "48_myc_oncogene_FR1",
      "question": "How many primary MYC-induced and repressed genes were identified via RNA-seq following a 16-hour inactivation of MYC?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "48_myc_oncogene.pdf"
      ],
      "expected_answer": "Following a 16-hour short-term inactivation of MYC, RNA-seq analysis identified approximately 1,200\u20131,400 primary MYC-induced and repressed genes. These genes represent the subset that requires continuous MYC activity for their activation or repression.",
      "expected_concepts": [
        "1,200\u20131,400",
        "primary genes",
        "16-hour inactivation",
        "RNA-seq",
        "MYC-induced"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "48_myc_oncogene_CO2",
      "question": "Based on the findings regarding the V394D mutant, what is the functional significance of the MYC-MIZ1 interaction in the context of liver tumor development?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "48_myc_oncogene.pdf"
      ],
      "expected_answer": "The interaction between MYC and the corepressor MIZ1 is essential for liver tumor initiation and the effective repression of specific target genes. Mice with the MYC-V394D mutant, which cannot bind MIZ1, exhibit impaired cellular transformation and significantly delayed tumorigenesis compared to wild-type MYC, indicating that MYC's ability to repress genes via MIZ1 is a critical driver of oncogenesis.",
      "expected_concepts": [
        "MIZ1",
        "tumor initiation",
        "V394D mutant",
        "gene repression",
        "cellular transformation"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "48_myc_oncogene_TE3",
      "question": "How does MYC activity influence RNA Polymerase II (RNAPII) across different gene categories, and how does this relate to the predictive power of MYC-binding profiles?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "48_myc_oncogene.pdf"
      ],
      "expected_answer": "MYC influences gene expression primarily by regulating the loading of RNA Polymerase II (RNAPII) at both activated and repressed loci. Despite this regulatory role, genomic MYC-binding profiles obtained by ChIP-seq are not predictive of whether a gene will be up- or downregulated, because overexpressed MYC associates widely with already active promoters regardless of whether those genes are primary or secondary MYC targets.",
      "expected_concepts": [
        "RNAPII loading",
        "ChIP-seq",
        "active promoters",
        "binding profiles",
        "up- or downregulated"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "49_pdbinder_FR1",
      "question": "What specific Matthews Correlation Coefficient (MCC) and Positive Predictive Value (PPV) did PDBinder achieve on the holo (bound) test set of 239 complex pairs?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "49_pdbinder.pdf"
      ],
      "expected_answer": "According to the findings, on a set of 239 holo complex pairs, PDBinder obtained an MCC of 0.313 with a PPV of 0.413. These metrics characterize the method's performance in identifying binding residues in the bound state of the protein.",
      "expected_concepts": [
        "MCC",
        "PPV",
        "0.313",
        "0.413",
        "holo test set"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "49_pdbinder_CO2",
      "question": "How does PDBinder's performance on apo proteins compare to its performance on holo proteins, and what does this suggest about the method's robustness?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "49_pdbinder.pdf"
      ],
      "expected_answer": "PDBinder maintains robust performance when transitioning from holo to apo proteins, with the MCC only dropping from 0.313 to 0.271 and the PPV from 0.413 to 0.372. This relatively small decrease in performance, compared to other existing tools, suggests that PDBinder's knowledge-based structural motifs are effective at identifying binding sites even when the protein has not undergone ligand-induced conformational changes.",
      "expected_concepts": [
        "apo proteins",
        "performance drop",
        "robustness",
        "structural motifs",
        "ligand-induced changes"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "49_pdbinder_TE3",
      "question": "Explain the role of 'Spatial Clustering' in the PDBinder methodology and the specific parameters used for this step.",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "49_pdbinder.pdf"
      ],
      "expected_answer": "Spatial Clustering is used as a refinement step in the PDBinder pipeline to improve residue predictions. It utilizes a 10 \u00c5 radius filtering step to discard high-propensity predictions that are spatially isolated, thereby focusing the prediction on clustered regions more likely to form a functional binding site.",
      "expected_concepts": [
        "Spatial Clustering",
        "10 \u00c5 radius",
        "filtering",
        "spatially isolated",
        "refine residue predictions"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "50_4cseq_protocol_FR1",
      "question": "According to the paper's subsampling analysis, what is the minimum number of cis-mapped reads required to generate reproducible 4C profiles?",
      "category": "factual_recall",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "50_4cseq_protocol.pdf"
      ],
      "expected_answer": "Subsampling analysis indicates that roughly 50,000\u2013100,000 cis-mapped 4C reads are sufficient to generate reproducible 4C profiles. This allows for reliable contact profile generation even with relatively low sequencing depth.",
      "expected_concepts": [
        "50,000\u2013100,000",
        "cis-mapped reads",
        "reproducible 4C profiles",
        "subsampling analysis"
      ],
      "difficulty": "easy",
      "answerable": true
    },
    {
      "id": "50_4cseq_protocol_CO2",
      "question": "How does the distribution of sequencing reads across the genome serve as an indicator of a successful 4C-seq experiment?",
      "category": "conceptual",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "50_4cseq_protocol.pdf"
      ],
      "expected_answer": "A successful 4C-seq experiment is characterized by a high concentration of reads on the 'cis chromosome,' which is the chromosome containing the viewpoint. Preferably, >60% of the reads should map to this chromosome, reflecting the higher frequency of local chromatin interactions compared to distal or trans-chromosomal interactions.",
      "expected_concepts": [
        "cis chromosome",
        "viewpoint",
        "60% of reads",
        "chromatin interactions",
        "mapping distribution"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "50_4cseq_protocol_TE3",
      "question": "What is the function of the 'motifPosperc' metric within the pipe4C data processing pipeline, and what value indicates high specificity?",
      "category": "technical",
      "subcategory": null,
      "min_core": 50,
      "source_files": [
        "50_4cseq_protocol.pdf"
      ],
      "expected_answer": "The 'motifPosperc' metric represents the percentage of reads in the FASTQ file where the first restriction enzyme (RE) motif position matches the most frequently occurring first RE motif position. To indicate high specificity of the 4C-seq library, this value should typically be greater than 90%.",
      "expected_concepts": [
        "motifPosperc",
        "restriction enzyme motif",
        "90%",
        "specificity",
        "pipe4C"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_001",
      "question": "How do the storage and cost-efficiency optimization strategies in the nf-core framework compare to the scheduling and transparency features of the Snakemake workflow system?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 5,
      "source_files": [
        "02_snakemake.pdf",
        "03_nfcore_framework.pdf"
      ],
      "expected_answer": "The nf-core framework focuses on cost and storage reduction through data-level optimizations, such as switching from BAM to CRAM to reduce storage by 65% and sharding FastQ files for intra-sample parallelization, leading to a 70% reduction in cloud costs. In contrast, Snakemake optimizes execution through its scheduler using Mixed Integer Linear Programming (MILP) and ensures transparency and efficiency via blockchain-style SHA-256 hashing for result caching. While nf-core targets architectural changes within the pipeline to save resources, Snakemake targets the algorithmic efficiency of the workflow management system itself.",
      "expected_concepts": [
        "CRAM",
        "BAM",
        "MILP",
        "SHA-256 hashing",
        "sharding",
        "cost reduction",
        "scheduling"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_002",
      "question": "How does the 'all-in-one' preprocessing approach of fastp complement the visualization and quality control capabilities of MultiQC in a bioinformatics pipeline?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 5,
      "source_files": [
        "04_fastp.pdf",
        "05_multiqc.pdf"
      ],
      "expected_answer": "fastp functions as a high-performance C++ preprocessor that integrates filtering, adapter trimming, and QC into a single scan to minimize I/O overhead, which is significantly faster than traditional tools. MultiQC complements this by acting downstream, recursively searching for the log files and metrics generated by tools like fastp to aggregate them into a single, interactive HTML report. This allows researchers to transition from the efficient, sample-specific processing of fastp to a global view of batch effects and trends across an entire project using MultiQC's Jinja2-based reporting.",
      "expected_concepts": [
        "single-scan",
        "I/O overhead",
        "adapter trimming",
        "batch effects",
        "Jinja2",
        "aggregated report",
        "log files"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_003",
      "question": "What common technical methodologies do Sarek, Snakemake, and the nf-core framework share to ensure reproducibility and portability across different computing environments?",
      "category": "cross_document",
      "subcategory": "shared_method",
      "min_core": 5,
      "source_files": [
        "01_sarek.pdf",
        "02_snakemake.pdf",
        "03_nfcore_framework.pdf"
      ],
      "expected_answer": "All three systems rely on containerization (Docker and Singularity) and environment management tools like Conda to ensure software portability. Sarek and the nf-core framework are built on the Nextflow language (utilizing DSL2 for modularity), while Snakemake uses its own Python-based Domain Specific Language (DSL). They all leverage these abstraction layers to allow workflows to run consistently across various infrastructures, ranging from single nodes to large-scale cloud environments like AWS Batch.",
      "expected_concepts": [
        "Docker",
        "Singularity",
        "Nextflow",
        "DSL",
        "modularity",
        "portability",
        "Conda"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_004",
      "question": "How could integrating fastp and the nf-core framework's data management optimizations improve the performance and resource usage of the Sarek WGS workflow?",
      "category": "cross_document",
      "subcategory": "sequential",
      "min_core": 5,
      "source_files": [
        "01_sarek.pdf",
        "03_nfcore_framework.pdf",
        "04_fastp.pdf"
      ],
      "expected_answer": "Integrating fastp into Sarek would accelerate the initial preprocessing stage by 2-5 times due to its multi-threaded C++ implementation and single-scan processing. Furthermore, adopting the nf-core framework's practice of switching internal file formats from BAM to CRAM would reduce the storage footprint of Sarek's work directory by approximately 65%. These improvements would likely reduce the 48-hour processing time for a 90x WGS dataset mentioned in the Sarek paper while significantly lowering the associated compute costs on commercial clouds.",
      "expected_concepts": [
        "CRAM",
        "WGS",
        "single-scan",
        "I/O overhead",
        "storage usage",
        "48 hours",
        "90x coverage"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_005",
      "question": "In the context of scaling large-scale genomic analyses, how do the parallelization strategies of the nf-core framework differ from the job scheduling approach of Snakemake?",
      "category": "cross_document",
      "subcategory": "contrasting",
      "min_core": 5,
      "source_files": [
        "02_snakemake.pdf",
        "03_nfcore_framework.pdf"
      ],
      "expected_answer": "The nf-core framework achieves scalability through intra-sample parallelization by sharding FastQ files into smaller chunks, allowing multiple parts of a single sample to be processed simultaneously. Snakemake, however, approaches scaling by using a scheduler based on Mixed Integer Linear Programming (MILP) to efficiently manage a Directed Acyclic Graph (DAG) of jobs. While nf-core optimizes the data structure to exploit parallel compute, Snakemake optimizes the mathematical allocation of tasks to maximize throughput and accommodate massive job graphs.",
      "expected_concepts": [
        "sharding",
        "FastQ",
        "MILP",
        "Directed Acyclic Graph",
        "DAG",
        "intra-sample parallelization",
        "job scheduling"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_006",
      "question": "How do the implementation languages and performance-optimization strategies of fastp, SeqKit, and Cutadapt differ in addressing computational bottlenecks in sequence data processing?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 10,
      "source_files": [
        "04_fastp.pdf",
        "09_seqkit.pdf",
        "10_cutadapt.pdf"
      ],
      "expected_answer": "fastp is implemented in C++ and uses a 'single-scan' approach with multi-threading to minimize I/O overhead and improve speed, outperforming traditional tools by 2\u20135 times. In contrast, SeqKit is written in Go and leverages Goroutines for parallelization along with custom data structures like slice-based ASCII indexing for reverse complementation, achieving high cross-platform portability and speed. Cutadapt is primarily Python-based with C extensions for alignment; however, it identifies I/O operations as its main performance bottleneck, whereas fastp specifically integrates multiple functions into one scan to circumvent this issue.",
      "expected_concepts": [
        "C++",
        "Go",
        "Python",
        "single-scan",
        "Goroutines",
        "I/O overhead",
        "parallelization",
        "multi-threading"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_007",
      "question": "In the context of RNA-seq analysis, how do the mapping and quantification methods of STAR and Salmon address technical challenges before downstream differential expression analysis in DESeq2?",
      "category": "cross_document",
      "subcategory": "sequential",
      "min_core": 10,
      "source_files": [
        "06_star_aligner.pdf",
        "07_salmon.pdf",
        "08_deseq2.pdf"
      ],
      "expected_answer": "STAR focuses on ultrafast alignment using Uncompressed Suffix Arrays and a Maximal Mappable Prefix (MMP) search to handle non-contiguous transcript structures and novel splice junctions. Salmon provides an alternative approach using lightweight 'quasi-mapping' and specifically corrects for fragment GC content and other technical biases that can lead to false positives. The results from such tools are then utilized by DESeq2, which applies Empirical Bayes shrinkage to dispersion and fold change estimates to handle the noise and small sample sizes typical in RNA-seq count data.",
      "expected_concepts": [
        "Suffix Arrays",
        "Maximal Mappable Prefix",
        "Quasi-mapping",
        "GC content bias",
        "Empirical Bayes",
        "Shrinkage estimation",
        "Splicing"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_008",
      "question": "How do the methodological approaches of Cutadapt and Salmon differ in their treatment of technical artifacts in RNA-seq data, and what are the specific implications for downstream analysis?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 10,
      "source_files": [
        "07_salmon.pdf",
        "10_cutadapt.pdf"
      ],
      "expected_answer": "Cutadapt addresses technical artifacts at the sequence level using semi-global (end-space free) alignment to identify and remove adapter sequences and perform quality-based trimming on raw reads. In contrast, Salmon handles systematic technical biases, such as fragment GC content and positional biases, by employing sample-specific bias models and a dual-phase inference algorithm during transcript quantification. While Cutadapt's removal of adapters prevents false mappings, Salmon's correction of GC content bias improves the accuracy of transcript abundance (TPM) estimates, leading to higher sensitivity and reduced false positives in downstream differential expression analysis.",
      "expected_concepts": [
        "semi-global alignment",
        "adapter sequence",
        "fragment GC content bias",
        "TPM",
        "quasi-mapping"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_009",
      "question": "How do the quality control and preprocessing capabilities of fastp and Cutadapt integrate with the visualization goals of MultiQC in high-throughput sequencing pipelines?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 10,
      "source_files": [
        "04_fastp.pdf",
        "05_multiqc.pdf",
        "10_cutadapt.pdf"
      ],
      "expected_answer": "fastp acts as an all-in-one preprocessor that performs quality control and adapter trimming in a single scan, while Cutadapt provides specialized, error-tolerant adapter removal, particularly for color-space data. Both tools generate log files and metrics during these processes. MultiQC then complements these tools by parsing their diverse output logs and aggregating the metrics into a single HTML report, allowing researchers to visualize trends and identify batch effects across large sample sets that were processed by either fastp or Cutadapt.",
      "expected_concepts": [
        "single-scan",
        "adapter trimming",
        "color-space",
        "batch effects",
        "log files",
        "aggregation",
        "Jinja2"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_010",
      "question": "Contrast the algorithmic approaches used by STAR and SeqKit to manage large-scale genomic sequence data efficiently.",
      "category": "cross_document",
      "subcategory": "contrasting",
      "min_core": 10,
      "source_files": [
        "06_star_aligner.pdf",
        "09_seqkit.pdf"
      ],
      "expected_answer": "STAR utilizes a Suffix Array (SA) and a sequential Maximal Mappable Prefix (MMP) seed search specifically designed for the complexities of RNA-seq alignment, such as splicing and chimeric transcripts. In contrast, SeqKit focuses on general FASTA/Q manipulation using the Go programming language, employing Goroutines for parallelizing CPU-intensive tasks and a 'two-pass' mode for certain operations to maintain a low memory footprint. While STAR optimizes for the high-speed mapping of reads to a reference genome, SeqKit optimizes for the high-speed parsing and transformation of raw sequence files.",
      "expected_concepts": [
        "Suffix Array",
        "Maximal Mappable Prefix",
        "Goroutines",
        "two-pass mode",
        "RNA-seq alignment",
        "FASTA/Q parsing"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_011",
      "question": "Compare the diagnostic methodologies and biological markers used to determine the prevalence of bovine tuberculosis (bTB) and Peste des petits ruminants (PPR) as described in studies from Cameroon and Ethiopia.",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 20,
      "source_files": [
        "14_bovine_tb_cameroon.pdf",
        "16_ppr_ethiopia.pdf"
      ],
      "expected_answer": "In paper 14, bovine tuberculosis (bTB) prevalence in Cameroon is determined using intradermal tests, specifically the Comparative Intradermal Tuberculin Test (CIDT) and the Simple Intradermal Tuberculin Test (SITT), which measure immune response to bovine and avian purified protein derivatives (PPD). In contrast, paper 16 estimates Peste des petits ruminants (PPR) seroprevalence in Ethiopia using a competitive Enzyme-Linked Immunosorbent Assay (c-ELISA) to detect antibodies specific to the PPR virus (PPRV). While the bTB study focuses on skin sensitivity to proteins, the PPR study measures herd immunity and virus exposure through serum sample negativity percentages (S/N%).",
      "expected_concepts": [
        "CIDT",
        "PPD",
        "c-ELISA",
        "seroprevalence",
        "intradermal",
        "antibody detection"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_012",
      "question": "How do the challenges of disease control identified in the Serengeti rabies vaccination campaigns relate to the vaccine efficacy findings for Lumpy Skin Disease (LSD)?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 20,
      "source_files": [
        "15_rabies_tanzania.pdf",
        "20_lsd_review.pdf"
      ],
      "expected_answer": "Paper 15 identifies that spatial heterogeneity in vaccination coverage and incursions from outside the district are the primary drivers of rabies persistence in the Serengeti, even when vaccines are available. Paper 20 complements this by noting that for Lumpy Skin Disease, highly effective biological tools like the homologous Lumpi-ProVacInd vaccine exist and offer perfect efficacy. Together, these papers suggest that while vaccine development (from 20) is a crucial step, the epidemiological success of a control program depends heavily on overcoming the implementation gaps and coverage heterogeneity described in 15.",
      "expected_concepts": [
        "spatial heterogeneity",
        "vaccination coverage",
        "homologous vaccine",
        "Lumpi-ProVacInd",
        "incursions",
        "herd immunity"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_013",
      "question": "How do the clinical manifestations and species-specific impacts of HPAI Clade 2.3.4.4 outbreaks compare between the intensive poultry sector in the Netherlands (2014-2022) and the broader ecological crisis observed in Canada during 2022?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 20,
      "source_files": [
        "12_hpai_netherlands.pdf",
        "19_hpai_canada.pdf"
      ],
      "expected_answer": "In the Netherlands, HPAI H5Nx clinical signs were found to be highly species-dependent: ducks (Anseriformes) predominantly showed nervous and locomotor signs (66.7%), while chickens (Galliformes) exhibited higher rates of mucosal and skin signs (up to 67.8%). Canada's 2022 H5N1 outbreak (Clade 2.3.4.4b) corroborated the severity of neurological signs, particularly during unprecedented interspecies spillover into mammalian carnivores. While the Dutch study focused on refining early warning criteria through mortality ratios in poultry, the Canadian context emphasized the massive scale of wildlife infection and the genomic evidence of mammalian adaptation (e.g., PB2-E627K mutations) across various species.",
      "expected_concepts": [
        "Clade 2.3.4.4",
        "nervous/locomotor signs",
        "mucosal/skin signs",
        "interspecies spillover",
        "mammalian adaptation",
        "mortality ratio"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_014",
      "question": "How do the risk factors identified for brucellosis in Ethiopian camels and small ruminants compare to the risk factors for Peste des petits ruminants (PPR) in the same region?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 20,
      "source_files": [
        "16_ppr_ethiopia.pdf",
        "17_brucellosis_ethiopia.pdf"
      ],
      "expected_answer": "Both papers 16 and 17 identify animal-level and management factors as key drivers of disease prevalence in Ethiopia. For PPR (16), animal origin is a major factor, with purchased or gifted animals being at higher risk, and older animals showing higher seropositivity. For brucellosis (17), large herd size and a history of retained fetal membranes (RFM) are the primary risk factors for camels and small ruminants. Both studies use multivariable logistic regression to isolate these epidemiological risks within pastoralist communities.",
      "expected_concepts": [
        "animal origin",
        "herd size",
        "retained fetal membranes (RFM)",
        "seropositivity",
        "logistic regression",
        "pastoralist"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_015",
      "question": "How might the use of ultra-fast preprocessing tools like fastp or specialized trimming tools like Cutadapt facilitate the epidemiological investigation of bovine tuberculosis (bTB)?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 20,
      "source_files": [
        "04_fastp.pdf",
        "10_cutadapt.pdf",
        "14_bovine_tb_cameroon.pdf"
      ],
      "expected_answer": "Paper 14 discusses the need for better understanding of bTB epidemiology through zoonotic risk factors. To perform high-quality genomic analysis of Mycobacterium bovis, researchers can use fastp (04) for ultra-fast, single-scan FASTQ preprocessing, including adapter trimming and quality filtering, which is 2\u20135 times faster than older tools. Alternatively, Cutadapt (10) could be used if specialized color-space data or gapped alignments are required, ensuring that the sequencing data used for risk factor modeling is free of technical artifacts and adapter contamination.",
      "expected_concepts": [
        "FASTQ preprocessing",
        "adapter trimming",
        "Mycobacterium bovis",
        "Single-scan processing",
        "Gapped alignment",
        "zoonotic risk"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_016",
      "question": "Contrast the epidemiological findings of the 2022 Lumpy Skin Disease (LSD) outbreak in Nawalpur, Nepal, with the general disease characteristics and prevention strategies outlined in global systematic reviews.",
      "category": "cross_document",
      "subcategory": "contrasting",
      "min_core": 20,
      "source_files": [
        "13_lsd_nepal.pdf",
        "20_lsd_review.pdf"
      ],
      "expected_answer": "The 2022 outbreak in Nepal recorded a morbidity rate of 28.02% and a mortality rate of 3.06%, consistent with the global range of less than 10% mortality described in systematic reviews. However, the case fatality rate (CFR) of 10.90% (deaths among affected animals) was notably high, and the Nepal study identified specific risk factors for local dry cattle and pregnant cows. The systematic review highlights broader prevention mechanisms, such as the high cross-protection afforded by Sheep and Goat Pox vaccines and the 'perfect efficacy' of newer homologous vaccines like Lumpi-ProVacInd. Both papers agree on the high morbidity of the disease and its status as a significant transboundary threat.",
      "expected_concepts": [
        "morbidity rate",
        "case fatality rate",
        "low mortality",
        "Lumpi-ProVacInd",
        "cross-protection",
        "transboundary disease"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_017",
      "question": "Using a One Health perspective, compare the identified zoonotic risk factors and surveillance priorities for HPAI H5N1 in Canada versus brucellosis in the Somali region of Ethiopia.",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 20,
      "source_files": [
        "19_hpai_canada.pdf",
        "17_brucellosis_ethiopia.pdf"
      ],
      "expected_answer": "Zoonotic risk for HPAI in Canada is primarily linked to viral evolution and interspecies spillover, with surveillance focusing on mammalian adaptation mutations (e.g., PB2-E627K, D701N) and migratory bird flyways. In contrast, the One Health risk for brucellosis in Ethiopia is driven by occupational exposure and livestock management, with significant animal-level risk factors including large herd sizes and a history of retained fetal membranes (RFM) in camels and small ruminants. While Canada emphasizes clinician awareness of neurological signs in humans following wildlife contact, Ethiopia's priorities focus on managing the high seroprevalence (confirmed via cELISA) in pastoralist communities and their livestock.",
      "expected_concepts": [
        "One Health",
        "zoonotic spillover",
        "mammalian adaptation mutations",
        "retained fetal membranes",
        "occupational exposure",
        "seroprevalence"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_018",
      "question": "How do the data management capabilities of MultiQC and SeqKit contrast with the data collection requirements for zoonotic disease studies in Ethiopia and Cameroon?",
      "category": "cross_document",
      "subcategory": "contrasting",
      "min_core": 20,
      "source_files": [
        "05_multiqc.pdf",
        "09_seqkit.pdf",
        "14_bovine_tb_cameroon.pdf",
        "17_brucellosis_ethiopia.pdf"
      ],
      "expected_answer": "Bioinformatics tools like MultiQC (05) and SeqKit (09) provide automated, high-performance methods for aggregating QC metrics and manipulating sequence data (FASTA/Q) across millions of records using parallelized Go and Python architectures. In contrast, the zoonotic studies in papers 14 (bTB) and 17 (brucellosis) rely on manual field data collection, including interviewee-administered questionnaires with professionals and multistage sampling of animal populations. While the bioinformatics tools optimize digital data processing speed, the disease studies focus on capturing complex sociodemographic and environmental metadata through direct human-animal interface interaction.",
      "expected_concepts": [
        "automated aggregation",
        "FASTA/Q manipulation",
        "questionnaires",
        "multistage sampling",
        "sociodemographic data",
        "parallelization"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_019",
      "question": "How do the implementation strategies for NGS workflow management and reproducibility differ between Nextflow-based systems like Sarek, the Snakemake system, and the HTS-flow platform?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 50,
      "source_files": [
        "01_sarek.pdf",
        "02_snakemake.pdf",
        "47_htsflow.pdf"
      ],
      "expected_answer": "Sarek utilizes Nextflow to ensure portability and scalability through containerization (Docker/Singularity) and environment management (Conda), achieving high accuracy in germline and somatic variant calling. Snakemake focuses on sustainability through a Python-based DSL, efficient job scheduling using Mixed Integer Linear Programming (MILP), and blockchain-style hashing for result caching. In contrast, HTS-flow prioritizes accessibility for non-IT experts by providing a Graphical User Interface (GUI) and direct integration with a Laboratory Information Management System (SMITH LIMS) to manage metadata and traceability.",
      "expected_concepts": [
        "Nextflow",
        "Containerization",
        "DSL",
        "MILP",
        "GUI",
        "LIMS",
        "Reproducibility"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_020",
      "question": "How does Kenya's One Health AMR Surveillance System (OHAMRS) align with the Integrated Surveillance System Evaluation (ISSE) framework's levels of surveillance?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 50,
      "source_files": [
        "45_kenya_amr_digital.pdf",
        "35_isse_framework_amr.pdf"
      ],
      "expected_answer": "Kenya's OHAMRS aligns with the ISSE framework by integrating human and animal health data into a Central Data Warehouse, addressing the ISSE's 'integration' level. The system uses DHIS2 to provide 42 customizable dashboards, which corresponds to the ISSE's levels of 'production of information' and 'generation of knowledge'. By scaling surveillance sites and diversifying data submission, OHAMRS aims to influence policy and decisions, fulfilling the framework's goal of moving beyond just data collection to impacting health outcomes.",
      "expected_concepts": [
        "OHAMRS",
        "DHIS2",
        "ISSE framework",
        "Integration",
        "Knowledge translation",
        "Decision making"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_021",
      "question": "Contrast the impact of using Operational Taxonomic Units (OTUs) versus Amplicon Sequence Variants (ASVs) on microbial community interpretation as discussed in recent microbiome studies.",
      "category": "cross_document",
      "subcategory": "contrasting",
      "min_core": 50,
      "source_files": [
        "28_otu_vs_asv.pdf",
        "29_16s_best_practices.pdf",
        "23_swine_cultivation.pdf"
      ],
      "expected_answer": "While OTU-based pipelines (like VSEARCH) rely on 97% similarity clustering and often report higher numbers of unique sequences and alpha diversity, ASV-based pipelines (like DADA2) use error correction to capture finer biological variation and allow for valid comparisons across independent studies. In swine cultivation research, ASV-based analysis was used to distinguish between culture-dependent and culture-independent diversity, detecting hundreds of observed ASVs across cultivation methods. Overall, ASVs are increasingly preferred in animal science because they avoid the arbitrary similarity thresholds of OTUs and provide higher resolution for identifying specific taxa.",
      "expected_concepts": [
        "OTU",
        "ASV",
        "DADA2",
        "Denoising",
        "Diversity",
        "Biological variation",
        "97% similarity"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_022",
      "question": "How do the strategies for managing large-scale NGS data efficiency compare between the nf-core framework, the fastp preprocessor, and the methylPipe epigenomics package?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 50,
      "source_files": [
        "03_nfcore_framework.pdf",
        "04_fastp.pdf",
        "46_methylpipe.pdf"
      ],
      "expected_answer": "The nf-core framework optimizes efficiency by switching from BAM to CRAM formats, reducing storage by 65%, and using intra-sample parallelization via sharding. fastp improves speed and reduces I/O overhead by performing quality control, adapter trimming, and filtering in a single scan of the data using multi-threading. methylPipe handles large epigenomics datasets by utilizing Tabix indexing for fast random access and efficient memory management, allowing genome-wide analysis of DNA methylation within standard computational resources.",
      "expected_concepts": [
        "CRAM",
        "Sharding",
        "Single-scan",
        "I/O overhead",
        "Tabix",
        "DNA methylation"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_023",
      "question": "In what ways do quantitative microbiome profiling (QMP) and culturomics address the biases of traditional relative microbiome profiling (RMP) in livestock research?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 50,
      "source_files": [
        "21_livestock_gut_microbiome_review.pdf",
        "24_chicken_microbiome.pdf",
        "23_swine_cultivation.pdf"
      ],
      "expected_answer": "Traditional RMP is limited by compositional effects and a bias toward common cosmopolitan breeds, as noted in general livestock reviews. QMP addresses this in chicken studies by combining high-throughput sequencing with qPCR to measure absolute abundance, revealing that RMP can introduce significant bias during the production cycle. Meanwhile, culturomics in swine research uses 53 different cultivation methods to detect higher microbial diversity (ASVs) than culture-independent methods, overcoming the 'uncultured' bias of standard molecular profiling.",
      "expected_concepts": [
        "Quantitative Microbiome Profiling",
        "Relative Microbiome Profiling",
        "qPCR",
        "Absolute abundance",
        "Culturomics",
        "ASVs"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_024",
      "question": "Based on WGS and genomic studies of foodborne pathogens (Salmonella, Campylobacter, and E. coli), how does the prevalence and transmission of antimicrobial resistance vary between different poultry-related sources and geographic regions?",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 50,
      "source_files": [
        "41_salmonella_wgs_amr.pdf",
        "42_campylobacter_wgs_thailand.pdf",
        "43_esbl_ecoli_food.pdf"
      ],
      "expected_answer": "WGS analysis shows that Salmonella resistance is significantly higher in imported raw chicken (especially S. Heidelberg from Brazil) than in edible leaves, with widespread multidrug resistance. Similarly, Campylobacter studies in Thailand report high phenotypic resistance to fluoroquinolones in C. jejuni and C. coli from commercial broilers. These findings are complemented by evidence that ESBL-producing E. coli in poultry share identical plasmid backbones with human isolates, highlighting the significant public health risk of foodborne AMR transmission.",
      "expected_concepts": [
        "WGS",
        "AMR",
        "Salmonella",
        "Campylobacter",
        "S. Heidelberg",
        "Fluoroquinolones",
        "Plasmids"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_025",
      "question": "How do One Health frameworks like GOHF and data-driven approaches incorporate technology to improve zoonotic disease detection and response?",
      "category": "cross_document",
      "subcategory": "shared_method",
      "min_core": 50,
      "source_files": [
        "34_one_health_framework.pdf",
        "37_data_driven_one_health.pdf",
        "45_kenya_amr_digital.pdf"
      ],
      "expected_answer": "The Generalized One Health Framework (GOHF) highlights the use of smartphone-based diagnostic systems for H5N1 and the PulseNet laboratory network to avert foodborne illnesses. Data-driven methods enhance this by utilizing machine learning models to predict zoonotic potential from viral and human genome sequences with high accuracy. In Kenya, this technological approach is manifested in the OHAMRS, which uses digital dashboards and middleware for real-time interoperability between human and animal health sectors to support evidence-based policy.",
      "expected_concepts": [
        "GOHF",
        "Machine learning",
        "OHAMRS",
        "H5N1",
        "PulseNet",
        "Digitalization",
        "Zoonotic potential"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_026",
      "question": "How does the host genome indirectly and directly influence animal phenotypes through the gut microbiome in dairy cows and pigs?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 50,
      "source_files": [
        "25_dairy_cow_feed.pdf",
        "26_gut_meat_quality.pdf",
        "22_pig_mags.pdf"
      ],
      "expected_answer": "In dairy cows, genomic regions on chromosomes BTA3, BTA7, and BTA11 influence feed efficiency (RFI) indirectly by modulating the abundance of specific microbes like Syntrophococcus and Prevotella. In pigs, the gut microbiome explains variation in finishing weight and meat quality, where specific genera like Ruminococcaceae are positively correlated with intramuscular fat (IMF) content. This microbial influence is linked to functional potentials, such as short-chain fatty acid (SCFA) production and carbohydrate degradation, which are characterized using metagenome-assembled genomes (MAGs).",
      "expected_concepts": [
        "RFI",
        "Syntrophococcus",
        "QTL",
        "Intramuscular fat",
        "IMF",
        "SCFA",
        "MAGs"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_027",
      "question": "Compare the organizational models and economic benefits of integrated One Health surveillance systems as discussed in global reviews and regional case studies.",
      "category": "cross_document",
      "subcategory": "comparison",
      "min_core": 50,
      "source_files": [
        "36_integrated_amr_surveillance.pdf",
        "37_data_driven_one_health.pdf",
        "38_one_health_africa.pdf"
      ],
      "expected_answer": "A scoping review identified 14 integrated surveillance systems for ABR, all in high-income countries, categorized by their level of structural and informational integration. Economic analysis indicates these systems are highly beneficial; for example, integrated surveillance of West Nile virus saved over \u20ac1 million, and shared laboratories reduced operational costs by 26%. While high-income countries have established models, Africa faces a higher burden of zoonotic disease, necessitating regional coordination groups like those under the African Union to harmonize prevention strategies.",
      "expected_concepts": [
        "Structural Integration",
        "Informational Integration",
        "Cost-effective",
        "Operational costs",
        "African Union",
        "ABR"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_028",
      "question": "How do tools for analyzing DNA methylation and chromatin structure complement traditional transcriptomic analysis of oncogene targets?",
      "category": "cross_document",
      "subcategory": "complementary",
      "min_core": 50,
      "source_files": [
        "46_methylpipe.pdf",
        "50_4cseq_protocol.pdf",
        "48_myc_oncogene.pdf"
      ],
      "expected_answer": "Traditional transcriptomic analysis of the MYC oncogene indicates that its binding profiles (via ChIP-seq) do not fully predict gene expression changes, as it only activates or represses half of its targets. Epigenomic tools like methylPipe complement this by integrating DNA methylation data and histone marks to identify differentially methylated regions (DMRs) that may influence gene regulation. Furthermore, 4C-seq allows for the identification of genomic contact frequencies (3D structure) from a specific viewpoint, providing a structural context for how oncogenic transcription factors interact with distant regulatory elements.",
      "expected_concepts": [
        "DNA methylation",
        "4C-seq",
        "DMR",
        "ChIP-seq",
        "MYC",
        "Chromatin structure",
        "Transcriptomics"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "CD_029",
      "question": "What are the shared methods and technical considerations for identifying antimicrobial resistance genes (ARGs) in different food animal environments?",
      "category": "cross_document",
      "subcategory": "shared_method",
      "min_core": 50,
      "source_files": [
        "33_amr_food_animals.pdf",
        "41_salmonella_wgs_amr.pdf",
        "22_pig_mags.pdf"
      ],
      "expected_answer": "Identifying ARGs across poultry and swine environments relies on advanced genomic techniques like Whole Genome Sequencing (WGS) and shotgun metagenomics. In studies of imported chicken and leaves, algorithms like Genefinder are used to identify chromosomal mutations and resistance determinants. In swine research, de novo assembly of metagenome-assembled genomes (MAGs) allows for the association of ARGs with specific uncultured taxa. A critical technical consideration across these environments is the ability of ARB and ARGs to spread via bioaerosols or manure, often mediated by mobile genetic elements (MGEs).",
      "expected_concepts": [
        "WGS",
        "Shotgun Metagenomics",
        "MAGs",
        "ARGs",
        "Genefinder",
        "MGEs",
        "Bioaerosols"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "CD_030",
      "question": "How do Salmon's bias correction and DESeq2's statistical modeling improve RNA-seq analysis, and how are these processes managed in high-throughput workflows like HTS-flow?",
      "category": "cross_document",
      "subcategory": "sequential",
      "min_core": 50,
      "source_files": [
        "07_salmon.pdf",
        "08_deseq2.pdf",
        "47_htsflow.pdf"
      ],
      "expected_answer": "Salmon improves transcript quantification by using dual-phase inference and correcting for fragment GC content bias, which reduces false positives in differential expression (DE). DESeq2 enhances DE analysis by using Empirical Bayes shrinkage for dispersion and fold change estimates, which stabilizes results for low-count genes. The HTS-flow platform automates these secondary analysis steps by providing modular scripts and a GUI, allowing researchers to consistently apply these tools to large-scale RNA-seq datasets while ensuring metadata-driven reproducibility.",
      "expected_concepts": [
        "GC content bias",
        "Quasi-mapping",
        "Empirical Bayes",
        "Shrinkage",
        "HTS-flow",
        "Secondary analysis"
      ],
      "difficulty": "medium",
      "answerable": true
    },
    {
      "id": "SY_001",
      "question": "Based on the development of Sarek, Snakemake, nf-core, and fastp, what is the emerging meta-strategy for balancing high-performance computational efficiency with scientific reproducibility in large-scale genomics?",
      "category": "synthesis",
      "subcategory": "meta_conclusion",
      "min_core": 5,
      "source_files": [
        "01_sarek.pdf",
        "02_snakemake.pdf",
        "03_nfcore_framework.pdf",
        "04_fastp.pdf"
      ],
      "expected_answer": "The emerging meta-strategy involves a tiered approach that decouples low-level computational optimization from high-level workflow logic. At the tool level, software like fastp prioritizes 'all-in-one' single-scan processing in C++ to minimize I/O overhead and improve raw speed. At the framework level, the nf-core and Sarek pipelines utilize Nextflow's DSL2 to enable modularity and parallelization strategies like FASTQ sharding, while simultaneously reducing storage costs through CRAM adoption. This is governed by workflow management systems like Snakemake and Nextflow, which ensure sustainability and reproducibility through Domain Specific Languages (DSLs), containerization (Docker/Singularity), and cryptographic hashing of results. Collectively, these papers demonstrate that modern bioinformatics achieves efficiency through tool-specific optimization (fastp) and workflow-specific resource scheduling (Snakemake/nf-core), while maintaining reproducibility through standardized execution environments.",
      "expected_concepts": [
        "DSL (Domain Specific Language)",
        "Single-scan processing",
        "Containerization",
        "CRAM conversion",
        "I/O overhead",
        "Parallelization/Sharding",
        "Sustainability"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_002",
      "question": "How do the collective methodologies across Sarek, Snakemake, nf-core, and MultiQC redefine the concept of 'quality control' from a single-sample metric to a system-wide diagnostic process?",
      "category": "synthesis",
      "subcategory": "theme_analysis",
      "min_core": 5,
      "source_files": [
        "01_sarek.pdf",
        "02_snakemake.pdf",
        "03_nfcore_framework.pdf",
        "05_multiqc.pdf"
      ],
      "expected_answer": "The papers collectively shift quality control (QC) from an isolated post-hoc check to an integrated, multi-layered diagnostic system. Sarek and nf-core embed QC as a foundational component of the automated pipeline, ensuring that best-practice tools like GATK and BQSR are applied consistently across all samples. Snakemake adds a layer of 'sustainable' QC by using SHA-256 hashing to ensure data provenance and reproducibility of results across different execution environments. MultiQC provides the final integration layer, synthesizing modular outputs from across these workflows into a unified visual report that allows researchers to identify global trends and batch effects. This synthesis reveals that modern QC is not just about measuring read quality, but about verifying the integrity of the execution logic (Snakemake), the standardization of the pipeline (Sarek/nf-core), and the statistical consistency of the entire sample cohort (MultiQC).",
      "expected_concepts": [
        "Batch effects",
        "Data provenance",
        "SHA-256 hashing",
        "Best-practice recommendations",
        "Global trends",
        "Modular architecture",
        "System-wide diagnostics"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_003",
      "question": "How does the corpus demonstrate an evolution from localized tool optimization to systemic computational efficiency across different layers of the bioinformatics stack?",
      "category": "synthesis",
      "subcategory": "trend_identification",
      "min_core": 10,
      "source_files": [
        "03_nfcore_framework.pdf",
        "04_fastp.pdf",
        "06_star_aligner.pdf",
        "09_seqkit.pdf"
      ],
      "expected_answer": "The corpus illustrates a transition from isolated algorithmic improvements to a tiered, systemic approach to computational efficiency. Localized optimizations are seen in tools like STAR and fastp, which utilize memory-intensive data structures like Suffix Arrays and single-scan I/O strategies in C++ to maximize individual CPU throughput. SeqKit advances this by leveraging modern language features like Go's Goroutines for high-performance parallel parsing across platforms. At the highest level, frameworks like nf-core shift the focus to infrastructure-wide efficiency, employing intra-sample sharding and containerization to achieve massive scalability and cost reduction in cloud environments. Collectively, these papers suggest that modern bioinformatics performance is no longer just an algorithmic challenge but a synergy between low-level data handling, language-specific concurrency, and high-level resource orchestration.",
      "expected_concepts": [
        "Suffix Arrays",
        "single-scan processing",
        "Goroutines",
        "intra-sample parallelization",
        "DSL2",
        "scalability",
        "CRAM format"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_004",
      "question": "Based on the various stages of data processing described, what meta-conclusion can be drawn regarding the relationship between technical artifact removal and statistical reliability in high-throughput sequencing analysis?",
      "category": "synthesis",
      "subcategory": "meta_conclusion",
      "min_core": 10,
      "source_files": [
        "04_fastp.pdf",
        "05_multiqc.pdf",
        "07_salmon.pdf",
        "08_deseq2.pdf",
        "10_cutadapt.pdf"
      ],
      "expected_answer": "A synthesis of these papers reveals that biological signal reliability is not the product of a single 'cleanup' step, but depends on a hierarchical, multi-stage intervention for technical artifacts and biases. Initial layers like fastp and Cutadapt provide essential physical cleanup by removing adapter sequences and low-quality bases. However, Salmon demonstrates that physical cleaning is insufficient, requiring sophisticated statistical models to correct for inherent fragment GC content and sequence-specific biases during quantification. MultiQC adds a holistic layer of quality control by identifying global batch effects and outliers that might be invisible at the single-sample level. Finally, DESeq2\u2019s use of Empirical Bayes shrinkage for dispersion estimation proves that robust statistical inference must explicitly model the persistent noise and variance that survive even the most rigorous pre-processing pipelines.",
      "expected_concepts": [
        "Fragment GC content bias",
        "Empirical Bayes shrinkage",
        "adapter trimming",
        "batch effects",
        "dispersion estimation",
        "quality control",
        "Logarithmic Fold Change (LFC)"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_005",
      "question": "How do human-animal interface factors, ranging from professional habits to physiological events in livestock, collectively define the zoonotic risk landscape for bacterial and viral pathogens in endemic regions?",
      "category": "synthesis",
      "subcategory": "meta_conclusion",
      "min_core": 20,
      "source_files": [
        "14_bovine_tb_cameroon.pdf",
        "17_brucellosis_ethiopia.pdf",
        "19_hpai_canada.pdf"
      ],
      "expected_answer": "Zoonotic risk at the human-animal interface is shaped by a combination of professional exposure, high-risk animal life-cycle events, and viral evolution. Evidence from Cameroon and Ethiopia indicates that occupations like butchering and livestock farming are primary drivers for bacterial zoonoses such as Bovine TB and Brucellosis, often exacerbated by a lack of awareness regarding transmission. Furthermore, physiological conditions in animals, such as history of retained fetal membranes (RFM) in cattle and camels, serve as significant indicators of high environmental contamination risk. This situational risk is further compounded by viral adaptation, where mutations in the H5N1 virus (e.g., PB2-E627K) facilitate spillover from wildlife into mammalian species, leading to severe neurological outcomes. Collectively, these findings suggest that zoonotic prevention must integrate sociodemographic, physiological, and genomic data to accurately map and mitigate spillover threats.",
      "expected_concepts": [
        "One Health",
        "Zoonotic spillover",
        "Retained Fetal Membranes (RFM)",
        "Mammalian adaptation mutations",
        "Bovine Tuberculosis",
        "Professional risk factors"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_006",
      "question": "Analyze how the choice of diagnostic methodology and specific assay thresholds influences the reported prevalence and epidemiological understanding of transboundary diseases.",
      "category": "synthesis",
      "subcategory": "methodology_comparison",
      "min_core": 20,
      "source_files": [
        "11_asf_burkina_faso.pdf",
        "14_bovine_tb_cameroon.pdf",
        "16_ppr_ethiopia.pdf",
        "17_brucellosis_ethiopia.pdf"
      ],
      "expected_answer": "Comparing diagnostic strategies across ASF, PPR, TB, and Brucellosis reveals that disease burden estimates are highly sensitive to the chosen methodology and assay thresholds. In Bovine TB studies, prevalence varied significantly based on whether the CIDT or SITT test was used and which specific PPD threshold was applied to interpret reactivity. Similarly, in PPR surveillance, seroprevalence interpretation is complicated by the animal's vaccination status and the specific 'Sample Negativity' (S/N%) cutoff used in c-ELISA tests. While some studies rely on molecular confirmation (PCR) to identify active viral genotypes (like ASFV Genotype I), others use sequential serological screening (RBPT followed by cELISA) to refine exposure estimates. This synthesis suggests that the perceived epidemiology of these diseases is not an absolute value but a variable heavily dependent on the standardization of diagnostic benchmarks and the technological approach of the region.",
      "expected_concepts": [
        "c-ELISA",
        "CIDT (Comparative Intradermal Tuberculin Test)",
        "Seroprevalence",
        "Diagnostic threshold",
        "Sample Negativity percentage (S/N%)",
        "Molecular characterization"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_007",
      "question": "How do anthropogenic factors, such as animal management and vaccination distribution, create patterns of disease persistence in regions attempting control of transboundary pathogens?",
      "category": "synthesis",
      "subcategory": "theme_analysis",
      "min_core": 20,
      "source_files": [
        "13_lsd_nepal.pdf",
        "15_rabies_tanzania.pdf",
        "16_ppr_ethiopia.pdf",
        "20_lsd_review.pdf"
      ],
      "expected_answer": "Anthropogenic drivers, specifically animal trade and uneven management practices, are critical for the persistence of transboundary animal diseases. Research on PPR in Ethiopia and LSD in Nepal highlights that the movement of 'gifted' or 'purchased' animals and the mixing of dry cattle or heifers are significant risk factors for introducing infection into previously healthy flocks. Furthermore, the effectiveness of mass vaccination is often undermined by spatial heterogeneity; in the case of Rabies, focal incidence remains high when vaccination coverage is uneven across neighboring villages, even if total coverage targets are nearly met. The persistence of these diseases is also linked to the 'transboundary' nature of the pathogens, where incursions from outside a managed district can re-seed infection. Thus, the common theme across these diverse viral threats is that management consistency and movement control are as vital as the biological efficacy of the vaccines used.",
      "expected_concepts": [
        "Spatial heterogeneity",
        "Animal movement",
        "Herd immunity",
        "Focal incidence",
        "Transboundary disease",
        "Rolling coverage"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_008",
      "question": "What meta-conclusion can be drawn regarding the evolving clinical presentation of livestock and wildlife viral diseases and its impact on traditional early-warning surveillance systems?",
      "category": "synthesis",
      "subcategory": "trend_identification",
      "min_core": 20,
      "source_files": [
        "12_hpai_netherlands.pdf",
        "18_fmd_review.pdf",
        "19_hpai_canada.pdf",
        "20_lsd_review.pdf"
      ],
      "expected_answer": "A synthesis of clinical data across HPAI, FMD, and LSD suggests a global trend where traditional diagnostic paradigms are being challenged by evolving, non-specific manifestations. HPAI H5N1 outbreaks have shown a significant shift toward neurological and locomotor signs in both poultry and mammalian spillover cases, which may be missed by surveillance systems focused only on high mortality or respiratory distress. Similarly, research into FMD identifies neoteric subclinical infections and 'carrier states' that allow for preclinical transmission, complicating the traditional 'symptom-first' detection model. LSD also presents a challenge with its high morbidity but low mortality rates, which can lead to delayed reporting in areas where only severe fatalities trigger action. Collectively, these papers indicate that future early-warning systems must move beyond mortality-centric metrics to incorporate multi-organ clinical assessments and subclinical screening to effectively manage emerging viral variants.",
      "expected_concepts": [
        "Neoteric subclinical infection",
        "Neurological signs",
        "Carrier state",
        "Pathogenesis",
        "Clade 2.3.4.4b",
        "Subclinical transmission"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_009",
      "question": "Identify the gaps in translating standardized, large-scale genomic workflows into the context of rapid, localized veterinary outbreak characterization based on current literature.",
      "category": "synthesis",
      "subcategory": "gap_analysis",
      "min_core": 20,
      "source_files": [
        "01_sarek.pdf",
        "03_nfcore_framework.pdf",
        "11_asf_burkina_faso.pdf",
        "19_hpai_canada.pdf"
      ],
      "expected_answer": "While frameworks like Sarek and nf-core provide highly optimized, reproducible pipelines for variant discovery in large-scale datasets, a gap remains in their deployment for localized, real-time veterinary surveillance. These pipelines are designed for high-performance computing (HPC) environments to handle WGS/WES data, yet veterinary characterization of ASF or HPAI often requires rapid, possibly field-based, molecular identification of specific gene regions (like the ASFV p72 or p54 genes) rather than full-scale genome reprocessing. Additionally, although standardized workflows reduce costs and improve portability, they do not inherently integrate the localized environmental and migratory data\u2014such as flyways for HPAI or village-level trade routes for ASF\u2014necessary for comprehensive outbreak investigation. This suggests that the next generation of bioinformatics tools must bridge the gap between 'best-practice' scalability and the immediate, context-specific needs of wildlife and livestock disease management.",
      "expected_concepts": [
        "Reproducibility",
        "DSL2 (Domain Specific Language)",
        "Molecular characterization",
        "WGS (Whole Genome Sequencing)",
        "Outbreak investigation",
        "Scalability"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_010",
      "question": "How do high-performance sequence manipulation tools and modular workflow management systems collectively address the technical challenges of modern genomic surveillance for transboundary pathogens?",
      "category": "synthesis",
      "subcategory": "theme_analysis",
      "min_core": 20,
      "source_files": [
        "02_snakemake.pdf",
        "05_multiqc.pdf",
        "09_seqkit.pdf",
        "11_asf_burkina_faso.pdf"
      ],
      "expected_answer": "Genomic surveillance of pathogens like ASFV requires a combination of speed, transparency, and comprehensive reporting. High-performance tools like SeqKit facilitate the rapid parsing and manipulation of sequencing data, overcoming bottlenecks in FASTA/Q processing that are critical during active outbreaks. These tools are integrated into 'sustainable' workflow managers like Snakemake, which ensure reproducibility through Directed Acyclic Graphs (DAGs) and result caching via blockchain-style hashing. To handle the complexity of large-scale analysis, reporting tools like MultiQC aggregate data from various bioinformatics modules into a single report, allowing for the rapid identification of batch effects or quality outliers across multiple samples. Together, this technology stack allows researchers to move from raw data to the molecular characterization of specific genotypes (e.g., ASFV Genotype I) with the speed and rigor required for global biosafety.",
      "expected_concepts": [
        "Directed Acyclic Graph (DAG)",
        "Goroutines",
        "Batch effects",
        "Reproducibility",
        "SHA-256 hashing",
        "Molecular characterization"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_011",
      "question": "How do modern workflow management systems and graphical interfaces collectively address the 'reproducibility crisis' in high-throughput sequencing analysis across different computational environments?",
      "category": "synthesis",
      "subcategory": "theme_analysis",
      "min_core": 50,
      "source_files": [
        "01_sarek.pdf",
        "02_snakemake.pdf",
        "03_nfcore_framework.pdf",
        "47_htsflow.pdf"
      ],
      "expected_answer": "The transition toward reproducible bioinformatics is achieved through a multi-layered approach involving domain-specific languages (DSLs), containerization, and integrated metadata management. Sarek and nf-core utilize Nextflow's DSL to ensure portability across clouds and clusters, while Snakemake employs a Python-based DSL and blockchain-style hashing to guarantee that results are traceable and cacheable. Complementing these command-line tools, HTS-flow integrates with Laboratory Information Management Systems (LIMS) to automate the documentation of primary and secondary analysis steps, ensuring that even users without deep IT expertise can generate reproducible results. Collectively, these systems replace brittle, manual scripts with version-controlled, containerized modules (Docker/Singularity) that abstract the underlying hardware, allowing for consistent execution from local nodes to massive cloud infrastructures like AWS Batch.",
      "expected_concepts": [
        "Nextflow DSL2",
        "Containerization (Docker/Singularity)",
        "Blockchain-style hashing",
        "LIMS integration",
        "Portability",
        "Directed Acyclic Graph (DAG)"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_012",
      "question": "Compare the strategies used by high-performance bioinformatics toolkits to overcome I/O and CPU bottlenecks when processing massive genomic and epigenomic datasets.",
      "category": "synthesis",
      "subcategory": "methodology_comparison",
      "min_core": 50,
      "source_files": [
        "04_fastp.pdf",
        "09_seqkit.pdf",
        "10_cutadapt.pdf",
        "46_methylpipe.pdf"
      ],
      "expected_answer": "Bioinformatics tools employ distinct programming paradigms and optimization techniques to handle the data-heavy nature of NGS. fastp (C++) and SeqKit (Go) prioritize multi-threading and efficient memory management; fastp uses a thread pool and single-scan processing to minimize I/O overhead, while SeqKit leverages Go\u2019s goroutines and ASCII code indexing for rapid parsing. Cutadapt, while utilizing Python for ease of use, implements performance-critical alignment logic in C extensions to maintain speed during adapter trimming. For epigenomic data, methylPipe utilizes Tabix indexing for fast random access to large compressed files, demonstrating that high-level languages like R can remain performant when paired with optimized data structures. Together, these tools show a trend toward moving away from multi-tool shell pipelines toward integrated, multi-threaded binaries that reduce the time spent on data reading and writing.",
      "expected_concepts": [
        "Single-scan processing",
        "Goroutines",
        "C extensions",
        "Tabix indexing",
        "Multi-threading",
        "I/O overhead"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_013",
      "question": "Analyze the shift from Operational Taxonomic Units (OTUs) to Amplicon Sequence Variants (ASVs) across diverse ecological studies and explain why this shift is critical for cross-study meta-analyses.",
      "category": "synthesis",
      "subcategory": "trend_identification",
      "min_core": 50,
      "source_files": [
        "23_swine_cultivation.pdf",
        "28_otu_vs_asv.pdf",
        "29_16s_best_practices.pdf",
        "30_aquaculture_metagenomics.pdf"
      ],
      "expected_answer": "The transition from OTU clustering to ASV-based denoising represents a paradigm shift toward higher resolution and better interoperability in microbiome research. Studies in wastewater (28) and aquaculture (30) demonstrate that while OTU clustering at 97% similarity often inflates diversity by grouping disparate sequences, ASVs provide a consistent biological unit that is not dependent on a specific dataset or clustering threshold. This allows researchers to compare findings across independent studies\u2014a core requirement for the 'best practices' outlined in animal science (29). Furthermore, in swine cultivation research (23), ASVs enabled the precise identification of taxa that were both culturable and detectable via sequencing, which would be obscured by the coarser resolution of OTUs. Ultimately, ASVs facilitate the creation of global databases and meta-analyses by providing a stable, sequence-based identifier that reflects true biological variation rather than sequencing artifacts.",
      "expected_concepts": [
        "Denoising (DADA2)",
        "97% similarity threshold",
        "Cross-study comparability",
        "Sequence-based identifiers",
        "Biological variation",
        "Resolution"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_014",
      "question": "What meta-conclusions can be drawn regarding the structural and informational requirements for moving One Health surveillance from theoretical frameworks to functional digital implementations?",
      "category": "synthesis",
      "subcategory": "meta_conclusion",
      "min_core": 50,
      "source_files": [
        "34_one_health_framework.pdf",
        "35_isse_framework_amr.pdf",
        "36_integrated_amr_surveillance.pdf",
        "45_kenya_amr_digital.pdf"
      ],
      "expected_answer": "Moving One Health surveillance from theory to practice requires overcoming significant fragmentation in data standards and institutional coordination. While general frameworks like GOHF (34) provide a roadmap for multisectoral collaboration, the evaluation of actual systems reveals a lack of consensus on what 'integrated surveillance' truly entails (35). A scoping review shows that most existing integrated systems are confined to high-income countries, suggesting a 'digital divide' in global health security (36). However, the implementation of OHAMRS in Kenya (45) demonstrates that using interoperable middleware (Open Interop) and standardized visualization tools (DHIS2) can successfully bridge the gap between human and animal health data. Therefore, a functional digital One Health system must prioritize informational integration\u2014the ability to harmonize data across sectors\u2014alongside the structural integration of policy and personnel.",
      "expected_concepts": [
        "Informational Integration",
        "Interoperability",
        "ISSE framework",
        "OHAMRS",
        "Multisectoral collaboration",
        "Data silos"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_015",
      "question": "How do spatial and host-species heterogeneities influence the persistence of viral pathogens, and what are the implications for designing sentinel surveillance systems?",
      "category": "synthesis",
      "subcategory": "theme_analysis",
      "min_core": 50,
      "source_files": [
        "12_hpai_netherlands.pdf",
        "15_rabies_tanzania.pdf",
        "19_hpai_canada.pdf",
        "37_data_driven_one_health.pdf"
      ],
      "expected_answer": "The persistence of viral pathogens is driven by uneven distribution across landscapes and species, necessitating a targeted, data-driven approach to surveillance. In rabies management, spatial heterogeneity in vaccination coverage\u2014rather than just the overall coverage level\u2014is a primary driver of disease focal points and persistence (15). Similarly, the shift in HPAI outbreaks toward different poultry types and wild birds (12, 19) shows that 'spillover' and 'spillback' events between species create complex reservoirs that standard surveillance may miss. Integrating these insights with machine learning and sentinel surveillance (37) allows for the prediction of high-risk 'hotspots' and the detection of mammalian-adaptation mutations (like PB2-E627K). Thus, surveillance must move beyond broad monitoring toward high-resolution, multi-species tracking that accounts for gaps in local immunity and environmental reservoirs.",
      "expected_concepts": [
        "Spatial heterogeneity",
        "Spillover/Spillback",
        "Sentinel surveillance",
        "Mammalian adaptation",
        "Focal incidence",
        "Machine learning"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_016",
      "question": "Evaluate how the integration of metagenomics, culturomics, and metabolomics is advancing our understanding of the 'functional' gut microbiome beyond simple taxonomic profiling.",
      "category": "synthesis",
      "subcategory": "trend_identification",
      "min_core": 50,
      "source_files": [
        "22_pig_mags.pdf",
        "23_swine_cultivation.pdf",
        "24_chicken_microbiome.pdf",
        "26_gut_meat_quality.pdf"
      ],
      "expected_answer": "The field is moving from 'who is there' to 'what they are doing' by combining multiple omics layers to link microbial activity to host health. Metagenome-assembled genomes (MAGs) provide the genomic blueprint for uncultured species, revealing their potential to produce short-chain fatty acids (SCFAs) (22). Culturomics complements this by isolating previously 'unculturable' bacteria, allowing for the experimental verification of these metabolic roles (23). In poultry, absolute quantitative profiling (QMP) paired with serum metabolomics has shown that the microbiome's influence is mediated through specific host metabolites (24). These functional insights directly translate to livestock production, as specific bacterial genera and their fermentation products (SCFAs) have been shown to regulate intramuscular fat deposition and overall meat quality (26). This synthesis reveals that only by integrating these tools can we establish the causal pathways between microbial metabolism and livestock productivity.",
      "expected_concepts": [
        "Metagenome-assembled genomes (MAGs)",
        "Culturomics",
        "Quantitative Microbiome Profiling (QMP)",
        "Short-chain fatty acids (SCFAs)",
        "Host-microbe-metabolite axis",
        "CAZymes"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_017",
      "question": "Assess the role of Whole Genome Sequencing (WGS) in replacing or augmenting phenotypic testing for international antimicrobial resistance (AMR) monitoring.",
      "category": "synthesis",
      "subcategory": "meta_conclusion",
      "min_core": 50,
      "source_files": [
        "41_salmonella_wgs_amr.pdf",
        "42_campylobacter_wgs_thailand.pdf",
        "43_esbl_ecoli_food.pdf"
      ],
      "expected_answer": "WGS is rapidly becoming a critical tool for AMR monitoring due to its ability to identify the precise genetic mechanisms underlying phenotypic resistance across multiple bacterial species. Case studies on Salmonella in England (41) and Campylobacter in Thailand (42) show that WGS provides essential epidemiological context, such as identifying specific sequence types (STs) and point mutations (SNPs in QRDR) that correlate with high-level fluoroquinolone resistance. WGS also reveals the genomic basis of ESBL production in E. coli from food animals, where plasmid analysis shows that resistance backbones can be nearly identical between animal and human isolates, with shared sequence types like ST10 and ST131 (43). While phenotypic testing measures the 'what' (minimum inhibitory concentration), WGS explains the 'how' and 'where' (e.g., plasmid-mediated transfer between chicken and human isolates). The integration of these techniques across Salmonella, Campylobacter, and E. coli allows for the detection of emerging resistance trends and cross-species transmission pathways long before they become widespread clinical problems.",
      "expected_concepts": [
        "Sequence Typing (ST)",
        "SNPs (Single Nucleotide Polymorphisms)",
        "Plasmid-mediated resistance",
        "QRDR mutations",
        "ESBL-producing E. coli",
        "WGS-based monitoring"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_018",
      "question": "Identify the critical gaps in current bioinformatics pipelines and experimental designs regarding the integration of non-bacterial domains and epigenetic data into microbial network models.",
      "category": "synthesis",
      "subcategory": "gap_analysis",
      "min_core": 50,
      "source_files": [
        "21_livestock_gut_microbiome_review.pdf",
        "27_inap_pipeline.pdf",
        "30_aquaculture_metagenomics.pdf",
        "46_methylpipe.pdf"
      ],
      "expected_answer": "Despite advances in sequencing, significant gaps remain in our ability to integrate diverse -omics data types into a unified ecological view. A major limitation identified in livestock research is the 'bacteriocentric' bias, where fungi (mycobiome) and viruses are frequently neglected due to a lack of specialized primers and reference databases (21, 30). While pipelines like iNAP (27) allow for interdomain network analysis, they are limited to datasets with fewer than 1,000 species/OTUs for online processing, and their correlation-based methods (such as SparCC) have not been validated for the much larger dimensionality of integrated cross-domain datasets. Furthermore, epigenomic data (e.g., DNA methylation) is almost entirely absent from standard microbial network models, despite its role in host-environment interactions. Current tools like methylPipe (46) are designed for specialized epigenomic analysis rather than integration with microbial ASVs, highlighting a need for new bioinformatics frameworks that can co-analyze microbial composition with host epigenetic and multi-domain (fungal/viral) data.",
      "expected_concepts": [
        "Bacteriocentric bias",
        "Interdomain Ecological Network Analysis",
        "High dimensionality",
        "Sparse correlations",
        "DNA methylation",
        "Non-bacterial microbes"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_019",
      "question": "How do anthropogenic factors and agricultural industrialization facilitate the environmental spread of antimicrobial resistance (AMR), and what does this imply for future mitigation strategies?",
      "category": "synthesis",
      "subcategory": "theme_analysis",
      "min_core": 50,
      "source_files": [
        "31_amr_one_health.pdf",
        "33_amr_food_animals.pdf",
        "40_animal_agriculture_one_health.pdf"
      ],
      "expected_answer": "The environmental spread of AMR is exacerbated by the sheer volume of antibiotics used in industrial agriculture, which often exceeds human medical use (31, 40). Mitigation is complicated by the fact that up to 75% of these antibiotics are excreted unmetabolized, turning manure and wastewater into environmental reservoirs (33). Furthermore, industrial practices like metaphylactic use and the presence of bioaerosols allow resistant bacteria to disperse up to 10 km from farm sites, contaminating surrounding ecosystems and human food chains (33). This implies that future mitigation cannot focus solely on human clinical settings; it must incorporate agricultural reforms, such as the use of biochar for waste treatment and the development of phage-based alternatives to traditional antibiotics. The synthesis suggests that without addressing the environmental 'leaks' from industrial farming, the spread of mobile genetic elements (MGEs) will continue to outpace the development of new human drugs.",
      "expected_concepts": [
        "Metaphylactic use",
        "Bioaerosols",
        "Excreted antibiotics",
        "Mobile Genetic Elements (MGEs)",
        "Environmental reservoirs",
        "Growth promotion"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "SY_020",
      "question": "What collective evidence exists regarding the importance of 'viewpoint' and 'location' in the design of next-generation sequencing experiments, from chromosome conformation to rumen sampling?",
      "category": "synthesis",
      "subcategory": "meta_conclusion",
      "min_core": 50,
      "source_files": [
        "21_livestock_gut_microbiome_review.pdf",
        "25_dairy_cow_feed.pdf",
        "29_16s_best_practices.pdf",
        "50_4cseq_protocol.pdf"
      ],
      "expected_answer": "Across various genomic disciplines, the specific 'viewpoint' or spatial location of sampling is a critical determinant of biological signal. In 4C-seq, the experiment is literally designed around a specific genomic 'viewpoint' to capture cis- and trans-interactions (50). This spatial specificity is mirrored in microbiome research, where the choice of sampling site (e.g., rumen vs. feces) significantly impacts the observed community and its correlation with host traits like feed efficiency (21, 29). For instance, specific rumen-associated microbes are linked to feed efficiency traits via pQTLs, whereas these associations might be lost if sampling only fecal matter (25). Collectively, these papers demonstrate that experimental design must prioritize biological relevance over convenience; whether mapping chromatin or the gut, the 'location' defines the ecological or structural interactions that can be discovered, requiring precise standardization for cross-study reproducibility.",
      "expected_concepts": [
        "Viewpoint (VP)",
        "Cis-interactions",
        "Spatial sampling bias",
        "Rumen-associated microbes",
        "pQTL",
        "Metadata standards"
      ],
      "difficulty": "hard",
      "answerable": true
    },
    {
      "id": "OOD_001",
      "question": "What is the chemical formula for water?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_002",
      "question": "Who wrote the novel 'One Hundred Years of Solitude'?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_003",
      "question": "What is the speed of light in a vacuum?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_004",
      "question": "In what year did the Berlin Wall fall?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_005",
      "question": "What is the largest planet in our solar system?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_006",
      "question": "What is the Pythagorean theorem?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_007",
      "question": "What is the boiling point of ethanol at standard atmospheric pressure?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_008",
      "question": "Who proposed the theory of general relativity?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_009",
      "question": "What are the main causes of the French Revolution?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_010",
      "question": "What programming language was originally developed by Guido van Rossum?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_011",
      "question": "How many chromosomes do humans have?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_012",
      "question": "What is the capital of Japan?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_013",
      "question": "What is the primary function of mitochondria in eukaryotic cells?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_014",
      "question": "What are Newton's three laws of motion?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_015",
      "question": "What is the molecular weight of glucose?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_016",
      "question": "Who painted the ceiling of the Sistine Chapel?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_017",
      "question": "What is the half-life of carbon-14?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_018",
      "question": "What is the difference between TCP and UDP protocols?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_019",
      "question": "What are the noble gases in the periodic table?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_020",
      "question": "What is the distance from the Earth to the Moon?",
      "category": "out_of_domain",
      "subcategory": "general_knowledge",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_021",
      "question": "What spectral classification system is used to categorize main-sequence stars, and how does surface temperature relate to stellar luminosity on the Hertzsprung-Russell diagram?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_022",
      "question": "How does the subduction of oceanic plates at convergent boundaries lead to the formation of volcanic arcs and deep-sea trenches?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_023",
      "question": "What is the role of the Higgs boson in the Standard Model of particle physics, and how was it experimentally confirmed at the Large Hadron Collider?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_024",
      "question": "What are the key differences between Keynesian and monetarist approaches to managing economic recessions?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_025",
      "question": "How do superconducting materials achieve zero electrical resistance, and what is the BCS theory of superconductivity?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_026",
      "question": "What is the Riemann hypothesis, and why is it considered one of the most important unsolved problems in pure mathematics?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_027",
      "question": "How does the Coriolis effect influence large-scale atmospheric circulation patterns and the formation of cyclones?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_028",
      "question": "What are the main principles of Chomsky's theory of Universal Grammar, and how has it influenced modern computational linguistics?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_029",
      "question": "What archaeological evidence supports the theory that ancient Polynesians navigated the Pacific Ocean using celestial observations and ocean swells?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_030",
      "question": "How do semiconductor doping processes create p-type and n-type materials for use in transistor fabrication?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_031",
      "question": "What are the psychological mechanisms behind confirmation bias, and how do they affect decision-making in clinical settings?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_032",
      "question": "How does the carbon cycle operate in deep-sea hydrothermal vent ecosystems, and what chemosynthetic organisms dominate these environments?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_033",
      "question": "What are the legal principles governing the doctrine of stare decisis in common law jurisdictions?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_034",
      "question": "How does crop rotation with leguminous plants improve soil nitrogen content through biological nitrogen fixation?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_035",
      "question": "What role does the photoperiod play in regulating flowering time in Arabidopsis thaliana, and which genes are involved?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_036",
      "question": "How do tidal forces from the Moon cause ocean tides, and what determines the difference between spring and neap tides?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_037",
      "question": "What are the primary differences between Gothic and Romanesque architectural styles in medieval European cathedrals?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_038",
      "question": "What is the mechanism of action of perovskite solar cells, and how does their efficiency compare to traditional silicon photovoltaics?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_039",
      "question": "How does the Monte Carlo tree search algorithm work in game-playing AI systems like AlphaGo?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_040",
      "question": "What are the harmonic principles underlying the Circle of Fifths in Western music theory?",
      "category": "out_of_domain",
      "subcategory": "excluded_topic",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_041",
      "question": "According to the Sarek paper, the pipeline only supports Illumina short-read sequencing data and cannot process any other data types. Is this correct?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_042",
      "question": "The fastp paper states that fastp requires separate tools for adapter trimming and quality filtering, processing data in multiple sequential passes. Is this accurate?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_043",
      "question": "According to the DESeq2 paper, the method uses raw read counts without any normalization or variance stabilization. Is this true?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_044",
      "question": "The Snakemake paper describes a system that uses a simple first-come-first-served job queue without any optimization of task scheduling. Is this correct?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_045",
      "question": "The nf-core framework paper reports that switching from BAM to CRAM format increased storage requirements by 65%. Is this finding correct?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_046",
      "question": "According to the Salmon paper, transcript quantification is performed using traditional full alignment of reads to the reference genome. Is this accurate?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_047",
      "question": "The LSD Nepal paper reports that pregnant cows had a significantly lower risk of LSD infection compared to non-pregnant cattle. Is this correct?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_048",
      "question": "The HPAI Netherlands paper found that clinical signs of avian influenza are identical across all poultry species, with no species-dependent differences. Is this true?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_049",
      "question": "According to the MultiQC paper, the tool can only process output from a single bioinformatics tool at a time and does not support aggregation across tools. Is this correct?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    },
    {
      "id": "OOD_050",
      "question": "The rabies Tanzania paper concludes that vaccination campaigns in the Serengeti achieved uniform coverage across all areas, completely eliminating spatial heterogeneity. Is this accurate?",
      "category": "out_of_domain",
      "subcategory": "contradictory",
      "min_core": null,
      "source_files": [],
      "expected_answer": "NOT_IN_CORPUS",
      "expected_concepts": [],
      "difficulty": null,
      "answerable": false
    }
  ]
}