[
    {
        "title": "MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies",
        "doi": "10.48550/arxiv.2404.06395",
        "abstract": "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warmup-Stable-Decay (WSD) learning rate scheduler (LRS), conducive to continuous training and domain adaptation. We present an in-depth analysis of the intriguing training dynamics that occurred in the WSD LRS. With WSD LRS, we are now able to efficiently study data-model scaling law without extensive retraining experiments on both axes of model and data, from which we derive the much higher compute optimal data-model ratio than Chinchilla Optimal. Additionally, we introduce MiniCPM family, including MiniCPM-DPO, MiniCPM-MoE and MiniCPM-128K, whose excellent performance further cementing MiniCPM's foundation in diverse SLM applications. MiniCPM models are available publicly at https://github.com/OpenBMB/MiniCPM .",
        "orkg_categories_flat": [
            "generic",
            "small language models (slms) survey"
        ],
        "papers_with_code_categories_flat": [
            "chinchilla",
            "domain adaptation",
            "language models",
            "natural language processing"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "computer science",
            "database",
            "geography",
            "meteorology",
            "natural language processing",
            "natural language processing techniques",
            "scalability",
            "topic modeling",
            "training (meteorology)"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "OLMo: Accelerating the Science of Language Models",
        "doi": "10.18653/v1/2024.acl-long.841",
        "abstract": "Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models, including their biases and potential risks, we believe it is essential for the research community to have access to powerful, truly open LMs. To this end, we have built OLMo, a competitive, truly Open Language Model, to enable the scientific study of language models. Unlike most prior efforts that have only released model weights and inference code, we release OLMo alongside open training data and training and evaluation code. We hope this release will empower the open research community and inspire a new wave of innovation.",
        "orkg_categories_flat": [
            "generic",
            "small language models (slms) survey"
        ],
        "papers_with_code_categories_flat": [
            "language modeling",
            "language modelling"
        ],
        "openalex_categories_flat": [
            "computer science",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Enhancing text-based knowledge graph completion with zero-shot large language models: A focus on semantic enhancement",
        "doi": "10.1016/j.knosys.2024.112155",
        "abstract": "The design and development of text-based knowledge graph completion (KGC) methods leveraging textual entity descriptions are at the forefront of research. These methods involve advanced optimization techniques such as soft prompts and contrastive learning to enhance KGC models. The effectiveness of text-based methods largely hinges on the quality and richness of the training data. Large language models (LLMs) can utilize straightforward prompts to alter text data, thereby enabling data augmentation for KGC. Nevertheless, LLMs typically demand substantial computational resources. To address these issues, we introduce a framework termed constrained prompts for KGC (CP-KGC). This CP-KGC framework designs prompts that adapt to different datasets to enhance semantic richness. Additionally, CP-KGC employs a context constraint strategy to effectively identify polysemous entities within KGC datasets. Through extensive experimentation, we have verified the effectiveness of this framework. Even after quantization, the LLM (Qwen-7B-Chat-int4) still enhances the performance of text-based KGC methods \\footnote{Code and datasets are available at \\href{https://github.com/sjlmg/CP-KGC}{https://github.com/sjlmg/CP-KGC}}. This study extends the performance limits of existing models and promotes further integration of KGC with LLMs.",
        "orkg_categories_flat": [
            "context constraint strategy",
            "contrastive learning",
            "data truncation in plms",
            "effective prompt design for llm-driven data augmentation",
            "framework termed constrained prompts for kgc (cp-kgc)",
            "polysemy in textual kgc inputs",
            "semantic compression",
            "semantic expansion",
            "semantic quality of entity descriptions",
            "soft prompts",
            "text-based knowledge graph completion",
            "use of quantized and full-scale llms (e.g., qwen-7b, gpt-3.5)"
        ],
        "papers_with_code_categories_flat": [
            "contrastive learning",
            "data augmentation",
            "graph embedding",
            "graph representation learning",
            "graphs",
            "hallucination",
            "knowledge graph completion",
            "knowledge graphs",
            "quantization",
            "text generation"
        ],
        "openalex_categories_flat": [
            "advanced graph neural networks",
            "biology",
            "computer science",
            "constraint (computer-aided design)",
            "context (archaeology)",
            "data quality and management",
            "engineering",
            "graph",
            "mechanical engineering",
            "paleontology",
            "theoretical computer science",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "COCONut: Modernizing COCO Segmentation",
        "doi": "10.1109/cvpr52733.2024.02065",
        "abstract": "Accepted at CVPR2024, data available at https://xdeng7.github.io/coconut.github.io/",
        "orkg_categories_flat": [
            "object detection",
            "universal segmentation"
        ],
        "papers_with_code_categories_flat": [
            "panoptic segmentation",
            "segmentation",
            "universal segmentation"
        ],
        "openalex_categories_flat": [
            "advanced image and video retrieval techniques",
            "advanced neural network applications",
            "artificial intelligence",
            "coco",
            "computer science",
            "industrial vision systems and defect detection"
        ],
        "openaire_categories_flat": [
            "computer and information sciences",
            "computer science - computer vision and pattern recognition",
            "computer vision and pattern recognition"
        ]
    },
    {
        "title": "Annotation Errors and NER: A Study with OntoNotes 5.0",
        "doi": "10.48550/arxiv.2406.19172",
        "abstract": "Named Entity Recognition (NER) is a well-studied problem in NLP. However, there is much less focus on studying NER datasets, compared to developing new NER models. In this paper, we employed three simple techniques to detect annotation errors in the OntoNotes 5.0 corpus for English NER, which is the largest available NER corpus for English. Our techniques corrected ~10% of the sentences in train/dev/test data. In terms of entity mentions, we corrected the span and/or type of ~8% of mentions in the dataset, while adding/deleting/splitting/merging a few more. These are large numbers of changes, considering the size of OntoNotes. We used three NER libraries to train, evaluate and compare the models trained with the original and the re-annotated datasets, which showed an average improvement of 1.23% in overall F-scores, with large (&gt;10%) improvements for some of the entity types. While our annotation error detection methods are not exhaustive and there is some manual annotation effort involved, they are largely language agnostic and can be employed with other NER datasets, and other sequence labelling tasks.",
        "orkg_categories_flat": [
            "named entity recognition",
            "natural language processing"
        ],
        "papers_with_code_categories_flat": [
            "focus",
            "named entity recognition",
            "named entity recognition (ner)",
            "named-entity-recognition",
            "natural language processing",
            "ner",
            "transformers"
        ],
        "openalex_categories_flat": [
            "annotation",
            "artificial intelligence",
            "computer science",
            "natural language processing",
            "natural language processing techniques",
            "semantic web and ontologies"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Understanding and Tackling Label Errors in Individual-Level Nature\n  Language Understanding",
        "doi": "10.48550/arxiv.2502.13297",
        "abstract": "Natural language understanding (NLU) is a task that enables machines to understand human language. Some tasks, such as stance detection and sentiment analysis, are closely related to individual subjective perspectives, thus termed individual-level NLU. Previously, these tasks are often simplified to text-level NLU tasks, ignoring individual factors. This not only makes inference difficult and unexplainable but often results in a large number of label errors when creating datasets. To address the above limitations, we propose a new NLU annotation guideline based on individual-level factors. Specifically, we incorporate other posts by the same individual and then annotate individual subjective perspectives after considering all individual posts. We use this guideline to expand and re-annotate the stance detection and topic-based sentiment analysis datasets. We find that error rates in the samples were as high as 31.7\\% and 23.3\\%. We further use large language models to conduct experiments on the re-annotation datasets and find that the large language models perform well on both datasets after adding individual factors. Both GPT-4o and Llama3-70B can achieve an accuracy greater than 87\\% on the re-annotation datasets. We also verify the effectiveness of individual factors through ablation studies. We call on future researchers to add individual factors when creating such datasets. Our re-annotation dataset can be found at https://github.com/24yearsoldstudent/Individual-NLU",
        "orkg_categories_flat": [
            "natural language processing",
            "sentiment analysis",
            "stance detection"
        ],
        "papers_with_code_categories_flat": [
            "natural language understanding",
            "sentiment analysis",
            "stance detection"
        ],
        "openalex_categories_flat": [
            "child and animal learning development",
            "cognitive psychology",
            "computer science",
            "design education and practice",
            "linguistics",
            "multi-criteria decision making",
            "natural language processing",
            "philosophy",
            "psychology"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Human Evaluation of Procedural Knowledge Graph Extraction from Text with Large Language Models",
        "doi": "10.1007/978-3-031-77792-9_26",
        "abstract": "Presentation given by Valentina Carriero about the paper with the same title at the 24th International Conference on Knowledge Engineering and Knowledge Management - Amsterdam, Netherlands - November 28, 2024.",
        "orkg_categories_flat": [
            "llms",
            "procedural knowledge extraction",
            "prompt engineering"
        ],
        "papers_with_code_categories_flat": [
            "language modeling",
            "language modelling",
            "large language model",
            "prompt engineering"
        ],
        "openalex_categories_flat": [
            "advanced text analysis techniques",
            "artificial intelligence",
            "computer science",
            "graph",
            "information extraction",
            "knowledge graph",
            "natural language processing",
            "natural language processing techniques",
            "theoretical computer science",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - human-computer interaction",
            "human-computer interaction"
        ]
    },
    {
        "title": "Structure Guided Large Language Model for SQL Generation",
        "doi": "10.48550/arxiv.2402.13284",
        "abstract": "The 42nd International Conference on Machine Learning",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "language modeling",
            "language modelling",
            "large language model",
            "model",
            "natural language queries",
            "text to sql",
            "text-to-sql"
        ],
        "openalex_categories_flat": [
            "advanced computational techniques and applications",
            "computer science",
            "data definition language",
            "distributed and parallel computing systems",
            "natural language processing",
            "pl/sql",
            "programming language",
            "query by example",
            "scientific computing and data management",
            "search engine",
            "sql",
            "sql/psm",
            "web search query",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - databases",
            "computer science - software engineering",
            "databases",
            "software engineering"
        ]
    },
    {
        "title": "TinyLlama: An Open-Source Small Language Model",
        "doi": "10.48550/arxiv.2401.02385",
        "abstract": "We present TinyLlama, a compact 1.1B language model pretrained on around 1 trillion tokens for approximately 3 epochs. Building on the architecture and tokenizer of Llama 2, TinyLlama leverages various advances contributed by the open-source community (e.g., FlashAttention and Lit-GPT), achieving better computational efficiency. Despite its relatively small size, TinyLlama demonstrates remarkable performance in a series of downstream tasks. It significantly outperforms existing open-source language models with comparable sizes. Our model checkpoints and code are publicly available on GitHub at https://github.com/jzhang38/TinyLlama.",
        "orkg_categories_flat": [
            "generic",
            "small language models (slms) survey"
        ],
        "papers_with_code_categories_flat": [
            "attention",
            "computational efficiency",
            "flashattention",
            "general",
            "language modeling",
            "language modelling",
            "model",
            "small language model"
        ],
        "openalex_categories_flat": [
            "algorithms and data compression",
            "archaeology",
            "architecture",
            "artificial intelligence",
            "code (set theory)",
            "computer science",
            "downstream (manufacturing)",
            "economics",
            "history",
            "language model",
            "natural language processing techniques",
            "open source",
            "operations management",
            "programming language",
            "set (abstract data type)",
            "software",
            "source code",
            "topic modeling",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "Phi-3 Technical Report: A Highly Capable Language Model Locally on Your\n  Phone",
        "doi": "10.48550/arxiv.2404.14219",
        "abstract": "We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion tokens, whose overall performance, as measured by both academic benchmarks and internal testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. Our training dataset is a scaled-up version of the one used for phi-2, composed of heavily filtered publicly available web data and synthetic data. The model is also further aligned for robustness, safety, and chat format. We also provide parameter-scaling results with a 7B, 14B models trained for 4.8T tokens, called phi-3-small, phi-3-medium, both significantly more capable than phi-3-mini (e.g., respectively 75%, 78% on MMLU, and 8.7, 8.9 on MT-bench). To enhance multilingual, multimodal, and long-context capabilities, we introduce three models in the phi-3.5 series: phi-3.5-mini, phi-3.5-MoE, and phi-3.5-Vision. The phi-3.5-MoE, a 16 x 3.8B MoE model with 6.6 billion active parameters, achieves superior performance in language reasoning, math, and code tasks compared to other open-source models of similar scale, such as Llama 3.1 and the Mixtral series, and on par with Gemini-1.5-Flash and GPT-4o-mini. Meanwhile, phi-3.5-Vision, a 4.2 billion parameter model derived from phi-3.5-mini, excels in reasoning tasks and is adept at handling both single-image and text prompts, as well as multi-image and text prompts.",
        "orkg_categories_flat": [
            "generic",
            "small language model (slm) survey comparison"
        ],
        "papers_with_code_categories_flat": [
            "15 ways to contact how can i speak to someone at delta airlines",
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "attention patterns",
            "bpe",
            "cosine annealing",
            "dense connections",
            "dropout",
            "ensembling",
            "feedforward networks",
            "fixed factorized attention",
            "gelu",
            "general",
            "gpt-3",
            "language modeling",
            "language modelling",
            "language models",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "llama",
            "math",
            "mmlu",
            "mmr total",
            "moe",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformers",
            "weight decay"
        ],
        "openalex_categories_flat": [
            "computational physics and python applications",
            "computer science",
            "linguistics",
            "philosophy",
            "phone"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens\n  of context",
        "doi": "10.48550/arxiv.2403.05530",
        "abstract": "In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February version on the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant designed for efficiency with minimal regression in quality. Gemini 1.5 models achieve near-perfect recall on long-context retrieval tasks across modalities, improve the state-of-the-art in long-document QA, long-video QA and long-context ASR, and match or surpass Gemini 1.0 Ultra's state-of-the-art performance across a broad set of benchmarks. Studying the limits of Gemini 1.5's long-context ability, we find continued improvement in next-token prediction and near-perfect retrieval (&gt;99%) up to at least 10M tokens, a generational leap over existing models such as Claude 3.0 (200k) and GPT-4 Turbo (128k). Finally, we highlight real-world use cases, such as Gemini 1.5 collaborating with professionals on completing their tasks achieving 26 to 75% time savings across 10 different job categories, as well as surprising new capabilities of large language models at the frontier; when given a grammar manual for Kalamang, a language with fewer than 200 speakers worldwide, the model learns to translate English to Kalamang at a similar level to a person who learned from the same content.",
        "orkg_categories_flat": [
            "finding pre-trained large language model"
        ],
        "papers_with_code_categories_flat": [
            "1 image, 2*2 stitching",
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "code generation",
            "dense connections",
            "dropout",
            "feedforward networks",
            "fs-mevqa",
            "general",
            "gpt-4",
            "image retrieval",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "long-context understanding",
            "math word problem solving",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "question answering",
            "regularization",
            "residual connection",
            "retrieval",
            "set",
            "skip connections",
            "softmax",
            "sparsity",
            "stochastic optimization",
            "subword segmentation",
            "temporal relation extraction",
            "transformer",
            "transformers",
            "video question answering",
            "visual question answering",
            "zero-shot video question answer"
        ],
        "openalex_categories_flat": [
            "archaeology",
            "business",
            "computer science",
            "context (archaeology)",
            "geography",
            "internet privacy",
            "semantic web and ontologies"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Self-Contrast: Better Reflection Through Inconsistent Solving Perspectives",
        "doi": "10.18653/v1/2024.acl-long.197",
        "abstract": "The reflection capacity of Large Language Model (LLM) has garnered extensive attention. A post-hoc prompting strategy, e.g., reflexion and self-refine, refines LLM's response based on self-evaluated or external feedback. However, recent research indicates without external feedback, LLM's intrinsic reflection is unstable. Our investigation unveils that the key bottleneck is the quality of the self-evaluated feedback. We find LLMs often exhibit overconfidence or high randomness when self-evaluate, offering stubborn or inconsistent feedback, which causes poor reflection. To remedy this, we advocate Self-Contrast: It adaptively explores diverse solving perspectives tailored to the request, contrasts the differences, and summarizes these discrepancies into a checklist which could be used to re-examine and eliminate discrepancies. Our method endows LLM with diverse perspectives to alleviate stubborn biases. Moreover, their discrepancies indicate potential errors or inherent uncertainties that LLM often overlooks. Reflecting upon these can catalyze more accurate and stable reflection. Experiments conducted on a series of reasoning and translation tasks with different LLMs serve to underscore the effectiveness and generality of our strategy.",
        "orkg_categories_flat": [
            "commonmt",
            "exploring self-refine methods in large language models",
            "gsm8k",
            "self-contrast",
            "svamp"
        ],
        "papers_with_code_categories_flat": [
            "language modeling",
            "language modelling",
            "large language model"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "computer science",
            "contrast (vision)",
            "education and critical thinking development",
            "programming language",
            "reflection (computer programming)"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Self-Refine Instruction-Tuning for Aligning Reasoning in Language Models",
        "doi": "10.48550/arxiv.2405.00402",
        "abstract": "The alignments of reasoning abilities between smaller and larger Language Models are largely conducted via Supervised Fine-Tuning (SFT) using demonstrations generated from robust Large Language Models (LLMs). Although these approaches deliver more performant models, they do not show sufficiently strong generalization ability as the training only relies on the provided demonstrations.  In this paper, we propose the Self-refine Instruction-tuning method that elicits Smaller Language Models to self-refine their abilities. Our approach is based on a two-stage process, where reasoning abilities are first transferred between LLMs and Small Language Models (SLMs) via Instruction-tuning on demonstrations provided by LLMs, and then the instructed models Self-refine their abilities through preference optimization strategies. In particular, the second phase operates refinement heuristics based on the Direct Preference Optimization algorithm, where the SLMs are elicited to deliver a series of reasoning paths by automatically sampling the generated responses and providing rewards using ground truths from the LLMs. Results obtained on commonsense and math reasoning tasks show that this approach significantly outperforms Instruction-tuning in both in-domain and out-domain scenarios, aligning the reasoning abilities of Smaller and Larger Language Models.",
        "orkg_categories_flat": [
            "common sense question answering",
            "exploring self-refine methods in large language models",
            "open book question answering",
            "physical interaction question answering",
            "social interaction question answering",
            "srit"
        ],
        "papers_with_code_categories_flat": [
            "math"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "computer science",
            "language model",
            "natural language processing",
            "natural language processing techniques",
            "speech and dialogue systems",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Pride and Prejudice: LLM Amplifies Self-Bias in Self-Refinement",
        "doi": "10.18653/v1/2024.acl-long.826",
        "abstract": "Recent studies show that large language models (LLMs) improve their performance through self-feedback on certain tasks while degrade on others. We discovered that such a contrary is due to LLM's bias in evaluating their own output. In this paper, we formally define LLM's self-bias - the tendency to favor its own generation - using two statistics. We analyze six LLMs (GPT-4, GPT-3.5, Gemini, LLaMA2, Mixtral and DeepSeek) on translation, constrained text generation, and mathematical reasoning tasks. We find that self-bias is prevalent in all examined LLMs across multiple languages and tasks. Our analysis reveals that while the self-refine pipeline improves the fluency and understandability of model outputs, it further amplifies self-bias. To mitigate such biases, we discover that larger model size and external feedback with accurate assessment can significantly reduce bias in the self-refine pipeline, leading to actual performance improvement in downstream tasks. The code and data are released at https://github.com/xu1998hz/llm_self_bias.",
        "orkg_categories_flat": [
            "exploring self-refine methods in large language models",
            "flores-200",
            "mqm",
            "self-bias"
        ],
        "papers_with_code_categories_flat": [
            "15 ways to contact how can i speak to someone at delta airlines",
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "attention patterns",
            "bpe",
            "cosine annealing",
            "dense connections",
            "dropout",
            "feedforward networks",
            "fixed factorized attention",
            "gelu",
            "general",
            "gpt-3",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "mathematical reasoning",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "text generation",
            "transformers",
            "weight decay"
        ],
        "openalex_categories_flat": [
            "computer science",
            "european and international contract law",
            "law",
            "law, economics, and judicial systems",
            "legal principles and applications",
            "political science",
            "prejudice (legal term)",
            "pride",
            "psychology",
            "social psychology"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Search-in-the-Chain: Interactively Enhancing Large Language Models with Search for Knowledge-intensive Tasks",
        "doi": "10.1145/3589334.3645363",
        "abstract": "Accepted by WWW 2024",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "fact checking",
            "information retrieval",
            "language modelling",
            "large language model",
            "long form question answering",
            "multi-hop question answering",
            "question answering",
            "retrieval",
            "retrieval-augmented generation",
            "slot filling",
            "slot-filling"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "astronomy",
            "chain (unit)",
            "computer science",
            "human–computer interaction",
            "information retrieval",
            "natural language processing",
            "natural language processing techniques",
            "physics",
            "programming language",
            "speech and dialogue systems",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "The Power of Noise: Redefining Retrieval for RAG Systems",
        "doi": "10.1145/3626772.3657834",
        "abstract": "Retrieval-Augmented Generation (RAG) has recently emerged as a method to extend beyond the pre-trained knowledge of Large Language Models by augmenting the original prompt with relevant passages or documents retrieved by an Information Retrieval (IR) system. RAG has become increasingly important for Generative AI solutions, especially in enterprise settings or in any domain in which knowledge is constantly refreshed and cannot be memorized in the LLM. We argue here that the retrieval component of RAG systems, be it dense or sparse, deserves increased attention from the research community, and accordingly, we conduct the first comprehensive and systematic examination of the retrieval strategy of RAG systems. We focus, in particular, on the type of passages IR systems within a RAG solution should retrieve. Our analysis considers multiple factors, such as the relevance of the passages included in the prompt context, their position, and their number. One counter-intuitive finding of this work is that the retriever's highest-scoring documents that are not directly relevant to the query (e.g., do not contain the answer) negatively impact the effectiveness of the LLM. Even more surprising, we discovered that adding random documents in the prompt improves the LLM accuracy by up to 35%. These results highlight the need to investigate the appropriate strategies when integrating retrieval with LLMs, thereby laying the groundwork for future research in this area.",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "bart",
            "bert",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "gelu",
            "general",
            "information retrieval",
            "language models",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with linear decay",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "rag",
            "regularization",
            "residual connection",
            "retrieval",
            "retrieval-augmented generation",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "text generation",
            "transformers",
            "weight decay",
            "wordpiece"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "computer science",
            "image (mathematics)",
            "music and audio processing",
            "natural language processing techniques",
            "noise (video)",
            "physics",
            "power (physics)",
            "quantum mechanics",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "02 engineering and technology",
            "0202 electrical engineering, electronic engineering, information engineering",
            "04 agricultural and veterinary sciences",
            "0404 agricultural biotechnology",
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language",
            "computer science - information retrieval",
            "information retrieval",
            "information retrieval; llm; rag",
            "rag; llm; information retrieval"
        ]
    },
    {
        "title": "Retrieval meets Long Context Large Language Models",
        "doi": "10.48550/arxiv.2310.03025",
        "abstract": "Extending the context window of large language models (LLMs) is getting popular recently, while the solution of augmenting LLMs with retrieval has existed for years. The natural questions are: i) Retrieval-augmentation versus long context window, which one is better for downstream tasks? ii) Can both methods be combined to get the best of both worlds? In this work, we answer these questions by studying both solutions using two state-of-the-art pretrained LLMs, i.e., a proprietary 43B GPT and Llama2-70B. Perhaps surprisingly, we find that LLM with 4K context window using simple retrieval-augmentation at generation can achieve comparable performance to finetuned LLM with 16K context window via positional interpolation on long context tasks, while taking much less computation. More importantly, we demonstrate that retrieval can significantly improve the performance of LLMs regardless of their extended context window sizes. Our best model, retrieval-augmented Llama2-70B with 32K context window, outperforms GPT-3.5-turbo-16k and Davinci003 in terms of average score on nine long context tasks including question answering, query-based summarization, and in-context few-shot learning tasks. It also outperforms its non-retrieval Llama2-70B-32k baseline by a margin, while being much faster at generation. Our study provides general insights on the choice of retrieval-augmentation versus long context extension of LLM for practitioners.",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "15 ways to contact how can i speak to someone at delta airlines",
            "16k",
            "4k",
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "attention patterns",
            "bpe",
            "cosine annealing",
            "dense connections",
            "discriminative fine-tuning",
            "dropout",
            "feedforward networks",
            "few-shot learning",
            "fine-tuning",
            "fixed factorized attention",
            "gelu",
            "general",
            "gpt",
            "gpt-3",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "multi-head attention",
            "natural language processing",
            "natural questions",
            "normalization",
            "output functions",
            "question answering",
            "regularization",
            "residual connection",
            "retrieval",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformers",
            "weight decay"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "automatic summarization",
            "biology",
            "computer science",
            "context (archaeology)",
            "information retrieval",
            "machine learning",
            "margin (machine learning)",
            "multimodal machine learning applications",
            "natural language processing techniques",
            "paleontology",
            "topic modeling",
            "window (computing)",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - computer vision and pattern recognition",
            "computer science - information retrieval",
            "computer science - machine learning",
            "computer vision and pattern recognition",
            "information retrieval",
            "machine learning"
        ]
    },
    {
        "title": "RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval",
        "doi": "10.48550/arxiv.2401.18059",
        "abstract": "Retrieval-augmented language models can better adapt to changes in world state and incorporate long-tail knowledge. However, most existing methods retrieve only short contiguous chunks from a retrieval corpus, limiting holistic understanding of the overall document context. We introduce the novel approach of recursively embedding, clustering, and summarizing chunks of text, constructing a tree with differing levels of summarization from the bottom up. At inference time, our RAPTOR model retrieves from this tree, integrating information across lengthy documents at different levels of abstraction. Controlled experiments show that retrieval with recursive summaries offers significant improvements over traditional retrieval-augmented LMs on several tasks. On question-answering tasks that involve complex, multi-step reasoning, we show state-of-the-art results; for example, by coupling RAPTOR retrieval with the use of GPT-4, we can improve the best performance on the QuALITY benchmark by 20% in absolute accuracy.",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "question answering",
            "regularization",
            "residual connection",
            "retrieval",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "combinatorics",
            "computer science",
            "data management and algorithms",
            "information retrieval",
            "mathematics",
            "semantic web and ontologies",
            "topic modeling",
            "tree (set theory)"
        ],
        "openaire_categories_flat": [
            "computer science - computation and language",
            "computer science - machine learning"
        ]
    },
    {
        "title": "Corrective Retrieval Augmented Generation",
        "doi": "10.48550/arxiv.2401.15884",
        "abstract": "Large language models (LLMs) inevitably exhibit hallucinations since the accuracy of generated texts cannot be secured solely by the parametric knowledge they encapsulate. Although retrieval-augmented generation (RAG) is a practicable complement to LLMs, it relies heavily on the relevance of retrieved documents, raising concerns about how the model behaves if retrieval goes wrong. To this end, we propose the Corrective Retrieval Augmented Generation (CRAG) to improve the robustness of generation. Specifically, a lightweight retrieval evaluator is designed to assess the overall quality of retrieved documents for a query, returning a confidence degree based on which different knowledge retrieval actions can be triggered. Since retrieval from static and limited corpora can only return sub-optimal documents, large-scale web searches are utilized as an extension for augmenting the retrieval results. Besides, a decompose-then-recompose algorithm is designed for retrieved documents to selectively focus on key information and filter out irrelevant information in them. CRAG is plug-and-play and can be seamlessly coupled with various RAG-based approaches. Experiments on four datasets covering short- and long-form generation tasks show that CRAG can significantly improve the performance of RAG-based approaches.",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "focus",
            "natural language processing",
            "rag",
            "retrieval",
            "retrieval-augmented generation",
            "transformers"
        ],
        "openalex_categories_flat": [
            "computer science",
            "information retrieval",
            "speech and dialogue systems"
        ],
        "openaire_categories_flat": [
            "004",
            "artificial intelligence",
            "artificial intelligence and robotics",
            "computation and language",
            "computer and information sciences",
            "computer applications to medicine. medical informatics",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "computer science - software engineering",
            "covid-19",
            "cyber security",
            "cybersecurity",
            "humans",
            "information storage and retrieval",
            "large language models",
            "machine learning",
            "public aspects of medicine",
            "r858-859.7",
            "ra1-1270",
            "retrieval-augmented generation",
            "sars-cov-2",
            "security test automation",
            "security test case generation",
            "software engineering",
            "software security"
        ]
    },
    {
        "title": "UniMS-RAG: A Unified Multi-source Retrieval-Augmented Generation for Personalized Dialogue Systems",
        "doi": "10.48550/arxiv.2401.13256",
        "abstract": "Large Language Models (LLMs) has shown exceptional capabilities in many natual language understanding and generation tasks. However, the personalization issue still remains a much-coveted property, especially when it comes to the multiple sources involved in the dialogue system. To better plan and incorporate the use of multiple sources in generating personalized response, we firstly decompose it into three sub-tasks: Knowledge Source Selection, Knowledge Retrieval, and Response Generation. We then propose a novel Unified Multi-Source Retrieval-Augmented Generation system (UniMS-RAG) Specifically, we unify these three sub-tasks with different formulations into the same sequence-to-sequence paradigm during the training, to adaptively retrieve evidences and evaluate the relevance on-demand using special tokens, called acting tokens and evaluation tokens. Enabling language models to generate acting tokens facilitates interaction with various knowledge sources, allowing them to adapt their behavior to diverse task requirements. Meanwhile, evaluation tokens gauge the relevance score between the dialogue context and the retrieved evidence. In addition, we carefully design a self-refinement mechanism to iteratively refine the generated response considering 1) the consistency scores between the generated response and retrieved evidence; and 2) the relevance scores. Experiments on two personalized datasets (DuLeMon and KBP) show that UniMS-RAG achieves state-of-the-art performance on the knowledge source selection and response generation task with itself as a retriever in a unified manner. Extensive analyses and discussions are provided for shedding some new perspectives for personalized dialogue systems.",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "rag",
            "response generation",
            "retrieval",
            "retrieval-augmented generation"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "biology",
            "computer science",
            "consistency (knowledge bases)",
            "context (archaeology)",
            "economics",
            "information retrieval",
            "language model",
            "law",
            "management",
            "natural language processing",
            "natural language processing techniques",
            "paleontology",
            "personalization",
            "political science",
            "relevance (law)",
            "selection (genetic algorithm)",
            "speech and dialogue systems",
            "task (project management)",
            "topic modeling",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "02 engineering and technology",
            "0202 electrical engineering, electronic engineering, information engineering",
            "04 agricultural and veterinary sciences",
            "0404 agricultural biotechnology",
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "FABULA: Intelligence Report Generation Using Retrieval-Augmented Narrative Construction",
        "doi": "10.1145/3625007.3627505",
        "abstract": "Narrative construction is the process of representing disparate event information into a logical plot structure that models an end to end story. Intelligence analysis is an example of a domain that can benefit tremendously from narrative construction techniques, particularly in aiding analysts during the largely manual and costly process of synthesizing event information into comprehensive intelligence reports. Manual intelligence report generation is often prone to challenges such as integrating dynamic event information, writing fine-grained queries, and closing information gaps. This motivates the development of a system that retrieves and represents critical aspects of events in a form that aids in automatic generation of intelligence reports.  We introduce a Retrieval Augmented Generation (RAG) approach to augment prompting of an autoregressive decoder by retrieving structured information asserted in a knowledge graph to generate targeted information based on a narrative plot model. We apply our approach to the problem of neural intelligence report generation and introduce FABULA, framework to augment intelligence analysis workflows using RAG. An analyst can use FABULA to query an Event Plot Graph (EPG) to retrieve relevant event plot points, which can be used to augment prompting of a Large Language Model (LLM) during intelligence report generation. Our evaluation studies show that the plot points included in the generated intelligence reports have high semantic relevance, high coherency, and low data redundancy.",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "bart",
            "bert",
            "bpe",
            "decoder",
            "dense connections",
            "dropout",
            "feedforward networks",
            "gelu",
            "general",
            "language modelling",
            "language models",
            "large language model",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with linear decay",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "rag",
            "regularization",
            "residual connection",
            "retrieval",
            "retrieval-augmented generation",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformers",
            "weight decay",
            "wordpiece"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "computer science",
            "data science",
            "database",
            "event (particle physics)",
            "graph",
            "information retrieval",
            "linguistics",
            "mathematics",
            "narrative",
            "natural language processing",
            "operating system",
            "philosophy",
            "physics",
            "plot (graphics)",
            "quantum mechanics",
            "redundancy (engineering)",
            "scientific computing and data management",
            "statistics",
            "theoretical computer science",
            "topic modeling",
            "web data mining and analysis",
            "workflow"
        ],
        "openaire_categories_flat": [
            "05 social sciences",
            "0501 psychology and cognitive sciences",
            "0509 other social sciences",
            "computer and information sciences",
            "computer science - information retrieval",
            "information retrieval"
        ]
    },
    {
        "title": "Reasoning on Graphs: Faithful and Interpretable Large Language Model Reasoning",
        "doi": "10.48550/arxiv.2310.01061",
        "abstract": "Accepted by ICLR 2024",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "knowledge graphs",
            "language modeling",
            "language modelling",
            "large language model",
            "retrieval",
            "valid"
        ],
        "openalex_categories_flat": [
            "advanced graph neural networks",
            "analytic reasoning",
            "artificial intelligence",
            "automated reasoning",
            "benchmark (surveying)",
            "case-based reasoning",
            "cognition",
            "cognitive science",
            "computer science",
            "deductive reasoning",
            "geodesy",
            "geography",
            "inference",
            "knowledge representation and reasoning",
            "model-based reasoning",
            "natural language processing",
            "natural language processing techniques",
            "neuroscience",
            "opportunistic reasoning",
            "psychology",
            "reasoning system",
            "topic modeling",
            "verbal reasoning"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "G-Retriever: Retrieval-Augmented Generation for Textual Graph\n  Understanding and Question Answering",
        "doi": "10.48550/arxiv.2402.07630",
        "abstract": "Given a graph with textual attributes, we enable users to `chat with their graph': that is, to ask questions about the graph using a conversational interface. In response to a user's questions, our method provides textual replies and highlights the relevant parts of the graph. While existing works integrate large language models (LLMs) and graph neural networks (GNNs) in various ways, they mostly focus on either conventional graph tasks (such as node, edge, and graph classification), or on answering simple graph queries on small or synthetic graphs. In contrast, we develop a flexible question-answering framework targeting real-world textual graphs, applicable to multiple applications including scene graph understanding, common sense reasoning, and knowledge graph reasoning. Toward this goal, we first develop a Graph Question Answering (GraphQA) benchmark with data collected from different tasks. Then, we propose our G-Retriever method, introducing the first retrieval-augmented generation (RAG) approach for general textual graphs, which can be fine-tuned to enhance graph understanding via soft prompting. To resist hallucination and to allow for textual graphs that greatly exceed the LLM's context window size, G-Retriever performs RAG over a graph by formulating this task as a Prize-Collecting Steiner Tree optimization problem. Empirical evaluations show that our method outperforms baselines on textual graph tasks from multiple domains, scales well with larger graph sizes, and mitigates hallucination.~\\footnote{Our codes and datasets are available at: \\url{https://github.com/XiaoxinHe/G-Retriever}}",
        "orkg_categories_flat": [
            "retrieval augmented generation (rag) method"
        ],
        "papers_with_code_categories_flat": [
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "bart",
            "bert",
            "bpe",
            "common sense reasoning",
            "dense connections",
            "dropout",
            "feedforward networks",
            "focus",
            "gelu",
            "general",
            "graph classification",
            "graph question answering",
            "hallucination",
            "language models",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with linear decay",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "question answering",
            "rag",
            "regularization",
            "residual connection",
            "retrieval",
            "retrieval-augmented generation",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformers",
            "weight decay",
            "wordpiece"
        ],
        "openalex_categories_flat": [
            "advanced graph neural networks",
            "advanced text analysis techniques",
            "computer science",
            "graph",
            "information retrieval",
            "labrador retriever",
            "medicine",
            "natural language processing",
            "pathology",
            "question answering",
            "theoretical computer science",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computer and information sciences",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "Generating Benchmarks for Factuality Evaluation of Language Models",
        "doi": "10.48550/arxiv.2307.06908",
        "abstract": "Before deploying a language model (LM) within a given domain, it is important to measure its tendency to generate factually incorrect information in that domain. Existing methods for factuality evaluation of LLM generation focus on facts sampled from the LM itself, and thus do not control the set of evaluated facts and might under-represent domain specific or rare facts. We propose FACTOR: Factual Assessment via Corpus TransfORmation, a scalable approach for evaluating LM factuality. FACTOR automatically transforms a factual corpus of interest into a benchmark evaluating an LM's propensity to generate true facts from the corpus vs. similar but incorrect statements. We use our framework to create three benchmarks: Wiki-FACTOR, News-FACTOR and Expert-FACTOR. We show that: (i) our benchmark scores increase with model size and improve when the LM is augmented with retrieval; (ii) benchmark score and perplexity do not always agree on model ranking; (iii) when perplexity and benchmark score disagree, the latter better reflects factuality in open-ended generation, as measured by human annotators. We make our data and code publicly available in https://github.com/AI21Labs/factor.",
        "orkg_categories_flat": [
            "ask inputquestion",
            "exploration of llm hallucination benchmarks",
            "task labelanswer",
            "task metriclikelihood",
            "task typemulti-choice qa"
        ],
        "papers_with_code_categories_flat": [
            "focus",
            "language modeling",
            "language modelling",
            "natural language processing",
            "retrieval",
            "transformers"
        ],
        "openalex_categories_flat": [
            "computer science",
            "linguistics",
            "natural language processing",
            "natural language processing techniques",
            "philosophy",
            "programming language",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - computer vision and pattern recognition",
            "computer science - information retrieval",
            "computer vision and pattern recognition",
            "information retrieval"
        ]
    },
    {
        "title": "BAMBOO: A Comprehensive Benchmark for Evaluating Long Text Modeling Capacities of Large Language Models",
        "doi": "10.48550/arxiv.2309.13345",
        "abstract": "Large language models (LLMs) have achieved dramatic proficiency over NLP tasks with normal length. Recently, multiple studies have committed to extending the context length and enhancing the long text modeling capabilities of LLMs. To comprehensively evaluate the long context ability of LLMs, we propose BAMBOO, a multi-task long context benchmark. BAMBOO has been designed with four principles: comprehensive capacity evaluation, avoidance of data contamination, accurate automatic evaluation, and different length levels. It consists of 10 datasets from 5 different long text understanding tasks, i.e. question answering, hallucination detection, text sorting, language modeling, and code completion, to cover core capacities and various domains of LLMs. We conduct experiments with five long context models on BAMBOO and further discuss four key research questions of long text. We also qualitatively analyze current long context models and point out future directions for enhancing long text modeling capacities. We release our data, prompts, and code at https://github.com/RUCAIBox/BAMBOO.",
        "orkg_categories_flat": [
            "exploration of llm hallucination benchmarks",
            "task inputpaper",
            "task labelsummary",
            "task metricprecision",
            "task typedetection"
        ],
        "papers_with_code_categories_flat": [
            "code completion",
            "hallucination",
            "language modeling",
            "language modelling",
            "question answering"
        ],
        "openalex_categories_flat": [
            "archaeology",
            "artificial intelligence",
            "bamboo",
            "benchmark (surveying)",
            "biology",
            "cartography",
            "code (set theory)",
            "computer science",
            "context (archaeology)",
            "context model",
            "data science",
            "ecology",
            "engineering",
            "geography",
            "geometry",
            "language model",
            "mathematics",
            "natural language processing",
            "natural language processing techniques",
            "object (grammar)",
            "point (geometry)",
            "programming language",
            "set (abstract data type)",
            "sorting",
            "systems engineering",
            "task (project management)",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "PURPLE: Making a Large Language Model a Better SQL Writer",
        "doi": "10.1109/icde60146.2024.00009",
        "abstract": "Large Language Model (LLM) techniques play an increasingly important role in Natural Language to SQL (NL2SQL) translation. LLMs trained by extensive corpora have strong natural language understanding and basic SQL generation abilities without additional tuning specific to NL2SQL tasks. Existing LLMs-based NL2SQL approaches try to improve the translation by enhancing the LLMs with an emphasis on user intention understanding. However, LLMs sometimes fail to generate appropriate SQL due to their lack of knowledge in organizing complex logical operator composition. A promising method is to input the LLMs with demonstrations, which include known NL2SQL translations from various databases. LLMs can learn to organize operator compositions from the input demonstrations for the given task. In this paper, we propose PURPLE (Pre-trained models Utilized to Retrieve Prompts for Logical Enhancement), which improves accuracy by retrieving demonstrations containing the requisite logical operator composition for the NL2SQL task on hand, thereby guiding LLMs to produce better SQL translation. PURPLE achieves a new state-of-the-art performance of 80.5% exact-set match accuracy and 87.8% execution match accuracy on the validation set of the popular NL2SQL benchmark Spider. PURPLE maintains high accuracy across diverse benchmarks, budgetary constraints, and various LLMs, showing robustness and cost-effectiveness.",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "general",
            "language modeling",
            "language modelling",
            "large language model",
            "natural language understanding",
            "set",
            "sparsity",
            "translation"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "computational physics and python applications",
            "computer science",
            "distributed and parallel computing systems",
            "natural language processing",
            "programming language",
            "scientific computing and data management",
            "sql"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - databases",
            "databases"
        ]
    },
    {
        "title": "Metasql: A Generate-Then-Rank Framework for Natural Language to SQL Translation",
        "doi": "10.1109/icde60146.2024.00143",
        "abstract": "The Natural Language Interface to Databases (NLIDB) empowers non-technical users with database access through intuitive natural language (NL) interactions. Advanced approaches, utilizing neural sequence-to-sequence models or large-scale language models, typically employ auto-regressive decoding to generate unique SQL queries sequentially. While these translation models have greatly improved the overall translation accuracy, surpassing 70% on NLIDB benchmarks, the use of auto-regressive decoding to generate single SQL queries may result in sub-optimal outputs, potentially leading to erroneous translations. In this paper, we propose Metasql, a unified generate-then-rank framework that can be flexibly incorporated with existing NLIDBs to consistently improve their translation accuracy. Metasql introduces query metadata to control the generation of better SQL query candidates and uses learning-to-rank algorithms to retrieve globally optimized queries. Specifically, Metasql first breaks down the meaning of the given NL query into a set of possible query metadata, representing the basic concepts of the semantics. These metadata are then used as language constraints to steer the underlying translation model toward generating a set of candidate SQL queries. Finally, Metasql ranks the candidates to identify the best matching one for the given NL query. Extensive experiments are performed to study Metasql on two public NLIDB benchmarks. The results show that the performance of the translation models can be effectively improved using Metasql.",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "general",
            "learning-to-rank",
            "set",
            "sparsity",
            "translation"
        ],
        "openalex_categories_flat": [
            "advanced database systems and queries",
            "archaeology",
            "artificial intelligence",
            "biochemistry",
            "chemistry",
            "combinatorics",
            "computer science",
            "gene",
            "history",
            "machine translation",
            "mathematics",
            "mathematics, computing, and information processing",
            "messenger rna",
            "natural (archaeology)",
            "natural language",
            "natural language processing",
            "programming language",
            "rank (graph theory)",
            "scientific computing and data management",
            "sql",
            "translation (biology)"
        ],
        "openaire_categories_flat": [
            "computer science - artificial intelligence",
            "computer science - databases"
        ]
    },
    {
        "title": "Middleware for LLMs: Tools Are Instrumental for Language Agents in Complex Environments",
        "doi": "10.18653/v1/2024.emnlp-main.436",
        "abstract": "EMNLP'2024; 18 pages, 8 figures, 8 tables",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "computer science",
            "computer security",
            "distributed computing",
            "middleware (distributed applications)",
            "multi-agent systems and negotiation",
            "natural language processing techniques",
            "semantic web and ontologies"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Knowledge-to-SQL: Enhancing SQL Generation with Data Expert LLM",
        "doi": "10.18653/v1/2024.findings-acl.653",
        "abstract": "Generating accurate SQL queries for user questions (text-to-SQL) has been a long-standing challenge since it requires a deep understanding of both the user's question and the corresponding database schema in order to retrieve the desired content accurately. Existing methods rely on the comprehensive capability of large language models (LLMs) to generate the SQL. However, some necessary knowledge is not explicitly included in the database schema and user question or has been learned by LLMs. Thus, the generated SQL of the knowledge-insufficient questions may be inaccurate, negatively influencing the text-to-SQL models' performance and robustness. To address this challenge, we propose the Knowledge-to-SQL framework, which employs tailored Data Expert LLM (DELLM) to provide helpful knowledge for all text-to-SQL models. Specifically, we introduce the detailed implementation of DELLM regarding table reading and the basic fine-tuning process. We further propose a Preference Learning via Database Feedback (PLDBF) strategy, refining the DELLM to generate more helpful knowledge for LLMs. Extensive experiments verify that DELLM can enhance the state-of-the-art approaches for text-to-SQL tasks. The corresponding code of DELLM is released for further research.",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "text to sql",
            "text-to-sql"
        ],
        "openalex_categories_flat": [
            "advanced database systems and queries",
            "computer science",
            "data definition language",
            "data transformation services",
            "database",
            "pl/sql",
            "query by example",
            "scientific computing and data management",
            "search engine",
            "semantic web and ontologies",
            "sql",
            "sql/psm",
            "stored procedure",
            "web search query",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Improving Demonstration Diversity by Human-Free Fusing for Text-to-SQL",
        "doi": "10.18653/v1/2024.findings-emnlp.65",
        "abstract": "Currently, the in-context learning method based on large language models (LLMs) has become the mainstream of text-to-SQL research. Previous works have discussed how to select demonstrations related to the user question from a human-labeled demonstration pool. However, human labeling suffers from the limitations of insufficient diversity and high labeling overhead. Therefore, in this paper, we discuss how to measure and improve the diversity of the demonstrations for text-to-SQL. We present a metric to measure the diversity of the demonstrations and analyze the insufficient of the existing labeled data by experiments. Based on the above discovery, we propose fusing iteratively for demonstrations (Fused) to build a high-diversity demonstration pool through human-free multiple-iteration synthesis, improving diversity and lowering label cost. Our method achieves an average improvement of 3.2% and 5.0% with and without human labeling on several mainstream datasets, which proves the effectiveness of Fused.",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "diversity",
            "in-context learning",
            "text to sql",
            "text-to-sql"
        ],
        "openalex_categories_flat": [
            "advanced database systems and queries",
            "anthropology",
            "artificial intelligence",
            "cloud computing and resource management",
            "computer science",
            "diversity (politics)",
            "natural language processing",
            "programming language",
            "scientific computing and data management",
            "sociology",
            "sql"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Decomposition for Enhancing Attention: Improving LLM-based Text-to-SQL through Workflow Paradigm",
        "doi": "10.18653/v1/2024.findings-acl.641",
        "abstract": "In-context learning of large-language models (LLMs) has achieved remarkable success in the field of natural language processing, while extensive case studies reveal that the single-step chain-of-thought prompting approach faces challenges such as attention diffusion and inadequate performance in complex tasks like text-to-SQL. To improve the contextual learning capabilities of LLMs in text-to-SQL, a workflow paradigm method is proposed, aiming to enhance the attention and problem-solving scope of LLMs through decomposition. Specifically, the information determination module for eliminating redundant information and the brand-new prompt structure based on problem classification greatly enhance the model's attention. Additionally, the inclusion of self-correction and active learning modules greatly expands the problem-solving scope of LLMs, hence improving the upper limit of LLM-based approaches. Extensive experiments conducted on three datasets demonstrate that our approach outperforms other methods by a significant margin. About 2-3 percentage point improvements compared to the existing baseline on the Spider Dev, Spider-Realistic, and Bird Dev datasets and new SOTA results on the Spider Test dataset are achieved. Our code is available on GitHub: \\url{https://github.com/FlyingFeather/DEA-SQL}.",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "active learning",
            "computer vision",
            "diffusion",
            "image generation models",
            "in-context learning",
            "text to sql",
            "text-to-sql"
        ],
        "openalex_categories_flat": [
            "advanced computational techniques and applications",
            "chemistry",
            "computer science",
            "database",
            "decomposition",
            "distributed and parallel computing systems",
            "organic chemistry",
            "scientific computing and data management",
            "software engineering",
            "sql",
            "workflow"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems",
        "doi": "10.18653/v1/2024.naacl-long.20",
        "abstract": "Evaluating retrieval-augmented generation (RAG) systems traditionally relies on hand annotations for input queries, passages to retrieve, and responses to generate. We introduce ARES, an Automated RAG Evaluation System, for evaluating RAG systems along the dimensions of context relevance, answer faithfulness, and answer relevance. By creating its own synthetic training data, ARES finetunes lightweight LM judges to assess the quality of individual RAG components. To mitigate potential prediction errors, ARES utilizes a small set of human-annotated datapoints for prediction-powered inference (PPI). Across eight different knowledge-intensive tasks in KILT, SuperGLUE, and AIS, ARES accurately evaluates RAG systems while using only a few hundred human annotations during evaluation. Furthermore, ARES judges remain effective across domain shifts, proving accurate even after changing the type of queries and/or documents used in the evaluated RAG systems. We make our code and datasets publicly available on Github.",
        "orkg_categories_flat": [
            "ares",
            "exploration of fine-tuning methods in single-llm evaluation in llm-as-judges"
        ],
        "papers_with_code_categories_flat": [
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "bart",
            "bert",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "gelu",
            "general",
            "language models",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with linear decay",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "rag",
            "regularization",
            "residual connection",
            "retrieval",
            "retrieval-augmented generation",
            "set",
            "skip connections",
            "softmax",
            "sparsity",
            "stochastic optimization",
            "subword segmentation",
            "transformers",
            "weight decay",
            "wordpiece"
        ],
        "openalex_categories_flat": [
            "computer architecture",
            "computer science",
            "information retrieval",
            "power systems and technologies"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - information retrieval",
            "information retrieval"
        ]
    },
    {
        "title": "CompassJudger-1: All-in-one Judge Model Helps Model Evaluation and\n  Evolution",
        "doi": "10.48550/arxiv.2410.16256",
        "abstract": "Technical Report, Code and Models: https://github.com/open-compass/CompassJudger",
        "orkg_categories_flat": [
            "compassjudger-1",
            "exploration of fine-tuning methods in single-llm evaluation in llm-as-judges"
        ],
        "papers_with_code_categories_flat": [
            "all",
            "attention",
            "attention mechanisms",
            "general",
            "model",
            "output functions",
            "softmax"
        ],
        "openalex_categories_flat": [
            "computer science",
            "evolutionary algorithms and applications",
            "scientific computing and data management",
            "simulation techniques and applications"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Automating psychological hypothesis generation with AI: when large language models meet causal graph",
        "doi": "10.1057/s41599-024-03407-5",
        "abstract": "<jats:title>Abstract</jats:title><jats:p>Leveraging the synergy between causal knowledge graphs and a large language model (LLM), our study introduces a groundbreaking approach for computational hypothesis generation in psychology. We analyzed 43,312 psychology articles using a LLM to extract causal relation pairs. This analysis produced a specialized causal graph for psychology. Applying link prediction algorithms, we generated 130 potential psychological hypotheses focusing on “well-being”, then compared them against research ideas conceived by doctoral scholars and those produced solely by the LLM. Interestingly, our combined approach of a LLM and causal graphs mirrored the expert-level insights in terms of novelty, clearly surpassing the LLM-only hypotheses (<jats:italic>t</jats:italic>(59) = 3.34,<jats:italic>p</jats:italic> = 0.007 and<jats:italic>t</jats:italic>(59) = 4.32,<jats:italic>p</jats:italic> &lt; 0.001, respectively). This alignment was further corroborated using deep semantic analysis. Our results show that combining LLM with machine learning techniques such as causal knowledge graphs can revolutionize automated discovery in psychology, extracting novel insights from the extensive literature. This work stands at the crossroads of psychology and artificial intelligence, championing a new enriched paradigm for data-driven hypothesis generation in psychological research.</jats:p>",
        "orkg_categories_flat": [
            "causal pair extraction",
            "combining causal kg with llm",
            "computational hypothesis generation",
            "machine learning",
            "psychological hypothesis generation",
            "semantic analysis"
        ],
        "papers_with_code_categories_flat": [
            "articles",
            "knowledge graphs",
            "language modeling",
            "language modelling",
            "large language model",
            "link prediction"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "cognitive abilities and testing",
            "cognitive science and mapping",
            "computer science",
            "graph",
            "mental health research topics",
            "natural language processing",
            "psychology",
            "theoretical computer science"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "az20-999",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computers and society",
            "computers and society",
            "h",
            "history of scholarship and learning. the humanities",
            "social sciences"
        ]
    },
    {
        "title": "Gemma 2: Improving Open Language Models at a Practical Size",
        "doi": "10.48550/arxiv.2408.00118",
        "abstract": "In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3 times bigger. We release all our models to the community.",
        "orkg_categories_flat": [
            "generic",
            "small language models (slms) survey"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "knowledge distillation",
            "label smoothing",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "biology",
            "botany",
            "computer science",
            "gemma",
            "natural language processing techniques"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "calidad de la atención de salud",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "evaluación de procesos en fisioterapia",
            "evaluación de resultado en la atención de salud",
            "farmacología & terapéutica",
            "health care outcome evaluation",
            "health care quality indicators",
            "indicadores de calidad de la atención de salud",
            "indicadores para la evaluación de la calidad de la atención en fisioterapia",
            "indicators for the evaluation of the quality of care in physiotherapy",
            "modalidades de fisioterapia",
            "outcome evaluation in health care",
            "physical therapy modalities",
            "process evaluation",
            "process evaluation in physiotherapy"
        ]
    },
    {
        "title": "MobileLLM: Optimizing Sub-billion Parameter Language Models for\n  On-Device Use Cases",
        "doi": "10.48550/arxiv.2402.14905",
        "abstract": "ICML 2024. Code is available at https://github.com/facebookresearch/MobileLLM",
        "orkg_categories_flat": [
            "generic",
            "small language models (slms) survey"
        ],
        "papers_with_code_categories_flat": [
            "attention",
            "attention mechanisms",
            "dense connections",
            "feedforward network",
            "feedforward networks",
            "focus",
            "general",
            "grouped-query attention",
            "natural language processing",
            "output functions",
            "softmax",
            "transformers"
        ],
        "openalex_categories_flat": [
            "computer science",
            "context-aware activity recognition systems",
            "econometrics",
            "economics",
            "green it and sustainability",
            "multimedia communication and technology"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "Compact Language Models via Pruning and Knowledge Distillation",
        "doi": "10.48550/arxiv.2407.14679",
        "abstract": "Large language models (LLMs) targeting different deployment scales and sizes are currently produced by training each variant from scratch; this is extremely compute-intensive. In this paper, we investigate if pruning an existing LLM and then re-training it with a fraction (&lt;3%) of the original training data can be a suitable alternative to repeated, full retraining. To this end, we develop a set of practical and effective compression best practices for LLMs that combine depth, width, attention and MLP pruning with knowledge distillation-based retraining; we arrive at these best practices through a detailed empirical exploration of pruning strategies for each axis, methods to combine axes, distillation strategies, and search techniques for arriving at optimal compressed architectures. We use this guide to compress the Nemotron-4 family of LLMs by a factor of 2-4x, and compare their performance to similarly-sized models on a variety of language modeling tasks. Deriving 8B and 4B models from an already pretrained 15B model using our approach requires up to 40x fewer training tokens per model compared to training from scratch; this results in compute cost savings of 1.8x for training the full model family (15B, 8B, and 4B). Minitron models exhibit up to a 16% improvement in MMLU scores compared to training from scratch, perform comparably to other community models such as Mistral 7B, Gemma 7B and Llama-3 8B, and outperform state-of-the-art compression techniques from the literature. We have open-sourced Minitron model weights on Huggingface, with corresponding supplementary material including example code available on GitHub.",
        "orkg_categories_flat": [
            "generic",
            "small language models (slms) survey"
        ],
        "papers_with_code_categories_flat": [
            "attention",
            "attention mechanisms",
            "general",
            "knowledge distillation",
            "language modeling",
            "language modelling",
            "mmlu",
            "model compression",
            "output functions",
            "pruning",
            "set",
            "softmax",
            "sparsity"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "biology",
            "botany",
            "chemistry",
            "chromatography",
            "computer science",
            "distillation",
            "mathematics",
            "natural language processing",
            "natural language processing techniques",
            "pruning"
        ],
        "openaire_categories_flat": [
            "02 engineering and technology",
            "0202 electrical engineering, electronic engineering, information engineering",
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - computer vision and pattern recognition",
            "computer science - machine learning",
            "computer vision and pattern recognition",
            "machine learning"
        ]
    },
    {
        "title": "Self-Refinement of Language Models from External Proxy Metrics Feedback",
        "doi": "10.48550/arxiv.2403.00827",
        "abstract": "It is often desirable for Large Language Models (LLMs) to capture multiple objectives when providing a response. In document-grounded response generation, for example, agent responses are expected to be relevant to a user's query while also being grounded in a given document. In this paper, we introduce Proxy Metric-based Self-Refinement (ProMiSe), which enables an LLM to refine its own initial response along key dimensions of quality guided by external metrics feedback, yielding an overall better final response. ProMiSe leverages feedback on response quality through principle-specific proxy metrics, and iteratively refines its response one principle at a time. We apply ProMiSe to open source language models Flan-T5-XXL and Llama-2-13B-Chat, to evaluate its performance on document-grounded question answering datasets, MultiDoc2Dial and QuAC, demonstrating that self-refinement improves response quality. We further show that fine-tuning Llama-2-13B-Chat on the synthetic dialogue data generated by ProMiSe yields significant performance improvements over the zero-shot baseline as well as a supervised fine-tuned model on human annotated data.",
        "orkg_categories_flat": [
            "exploring self-refine methods in large language models",
            "multidoc2dia",
            "promise",
            "quac"
        ],
        "papers_with_code_categories_flat": [
            "question answering",
            "response generation"
        ],
        "openalex_categories_flat": [
            "computer science",
            "econometrics",
            "economics",
            "machine learning",
            "natural language processing techniques",
            "proxy (statistics)"
        ],
        "openaire_categories_flat": [
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning"
        ]
    },
    {
        "title": "CYCLE: Learning to Self-Refine the Code Generation",
        "doi": "10.1145/3649825",
        "abstract": "<jats:p>Pre-trained code language models have achieved promising performance in code generation and improved the programming efficiency of human developers. However, their self-refinement capability is typically overlooked by the existing evaluations of code LMs, which focus only on the accuracy of the one-time prediction. For the cases when code LMs fail to implement the correct program, developers actually find it hard to debug and fix the faulty prediction since it is not written by the developers themselves. Unfortunately, our study reveals that code LMs cannot efficiently self-refine their faulty generations as well.</jats:p>           <jats:p>In this paper, we propose CYCLE framework, learning to self-refine the faulty generation according to the available feedback, such as the execution results reported by the test suites. We evaluate CYCLE on three popular code generation benchmarks, HumanEval, MBPP, and APPS. The results reveal that CYCLE successfully maintains, sometimes improves, the quality of one-time code generation, while significantly improving the self-refinement capability of code LMs. We implement four variants of CYCLE with varied numbers of parameters across 350M, 1B, 2B, and 3B, and the experiments show that CYCLE consistently boosts the code generation performance, by up to 63.5</jats:p>",
        "orkg_categories_flat": [
            "apps",
            "cycle",
            "exploring self-refine methods in large language models",
            "humaneval",
            "mbpp-s"
        ],
        "papers_with_code_categories_flat": [
            "code generation",
            "focus",
            "humaneval",
            "mbpp",
            "natural language processing",
            "transformers"
        ],
        "openalex_categories_flat": [
            "code (set theory)",
            "computer science",
            "model-driven software engineering techniques",
            "programming language",
            "set (abstract data type)",
            "software engineering research",
            "software testing and debugging techniques"
        ],
        "openaire_categories_flat": [
            "02 engineering and technology",
            "0202 electrical engineering, electronic engineering, information engineering",
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language",
            "computer science - software engineering",
            "software engineering"
        ]
    },
    {
        "title": "Verification and Refinement of Natural Language Explanations through LLM-Symbolic Theorem Proving",
        "doi": "10.18653/v1/2024.emnlp-main.172",
        "abstract": "Camera-ready for EMNLP 2024",
        "orkg_categories_flat": [
            "e-snli",
            "exp-refiner",
            "exploring self-refine methods in large language models",
            "qasc",
            "worldtree"
        ],
        "papers_with_code_categories_flat": [
            "automated theorem proving",
            "natural language inference"
        ],
        "openalex_categories_flat": [
            "algebra over a field",
            "archaeology",
            "automated theorem proving",
            "calculus (dental)",
            "computer science",
            "dentistry",
            "history",
            "mathematics",
            "medicine",
            "natural (archaeology)",
            "natural language",
            "natural language processing",
            "natural language processing techniques",
            "programming language",
            "pure mathematics",
            "semantic web and ontologies"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "DAQ: Density-Aware Post-Training Weight-Only Quantization For LLMs",
        "doi": "10.48550/arxiv.2410.12187",
        "abstract": "Large language models (LLMs) excel in various tasks but face deployment challenges due to hardware constraints. We propose density-aware post-training weight-only quantization (DAQ), which has two stages: 1) density-centric alignment, which identifies the center of high-density weights and centers the dynamic range on this point to align high-density weight regions with floating-point high-precision regions; 2) learnable dynamic range adjustment, which adjusts the dynamic range by optimizing quantization parameters (i.e., scale and zero-point) based on the impact of weights on the model output. Experiments on LLaMA and LLaMA-2 show that DAQ consistently outperforms the best baseline method, reducing perplexity loss by an average of 22.8% on LLaMA and 19.6% on LLaMA-2. Our code is available at https://github.com/LuoYingSong/DAQ.",
        "orkg_categories_flat": [
            "daq",
            "weight-only quantization methods for llms"
        ],
        "papers_with_code_categories_flat": [
            "align",
            "computer vision",
            "language models",
            "llama",
            "natural language processing",
            "quantization",
            "vision and language pre-trained models"
        ],
        "openalex_categories_flat": [
            "advanced image and video retrieval techniques",
            "algorithm",
            "computer science",
            "geography",
            "medical image segmentation techniques",
            "medical imaging techniques and applications",
            "meteorology",
            "quantization (signal processing)",
            "training (meteorology)"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "MobileQuant: Mobile-friendly Quantization for On-device Language Models",
        "doi": "10.48550/arxiv.2408.13933",
        "abstract": "Large language models (LLMs) have revolutionized language processing, delivering outstanding results across multiple applications. However, deploying LLMs on edge devices poses several challenges with respect to memory, energy, and compute costs, limiting their widespread use in devices such as mobile phones. A promising solution is to reduce the number of bits used to represent weights and activations. While existing works have found partial success at quantizing LLMs to lower bitwidths, e.g. 4-bit weights, quantizing activations beyond 16 bits often leads to large computational overheads due to poor on-device quantization support, or a considerable accuracy drop. Yet, 8-bit activations are very attractive for on-device deployment as they would enable LLMs to fully exploit mobile-friendly hardware, e.g. Neural Processing Units (NPUs). In this work, we make a first attempt to facilitate the on-device deployment of LLMs using integer-only quantization. We first investigate the limitations of existing quantization methods for on-device deployment, with a special focus on activation quantization. We then address these limitations by introducing a simple post-training quantization method, named MobileQuant, that extends previous weight equivalent transformation works by jointly optimizing the weight transformation and activation range parameters in an end-to-end manner. MobileQuant demonstrates superior capabilities over existing methods by 1) achieving near-lossless quantization on a wide range of LLM benchmarks, 2) reducing latency and energy consumption by 20\\%-50\\% compared to current on-device quantization strategies, 3) requiring limited compute budget, 4) being compatible with mobile-friendly compute units, e.g. NPU.",
        "orkg_categories_flat": [
            "mobilequant",
            "weight-only quantization methods for llms"
        ],
        "papers_with_code_categories_flat": [
            "focus",
            "natural language processing",
            "quantization",
            "transformers"
        ],
        "openalex_categories_flat": [
            "biology",
            "computer science",
            "computer vision",
            "ecology",
            "environmentally friendly",
            "mobile device",
            "quantization (signal processing)",
            "recommender systems and techniques",
            "speech and dialogue systems",
            "topic modeling",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "GWQ: Gradient-Aware Weight Quantization for Large Language Models",
        "doi": "10.48550/arxiv.2411.00850",
        "abstract": "Large language models (LLMs) show impressive performance in solving complex language tasks. However, its large number of parameters presents significant challenges for the deployment. So, compressing LLMs to low bits can enable to deploy on resource-constrained devices. To address this problem, we propose gradient-aware weight quantization (GWQ), the first quantization approach for low-bit weight quantization that leverages gradients to localize outliers, requiring only a minimal amount of calibration data for outlier detection. GWQ retains the top 1\\% outliers preferentially at FP16 precision, while the remaining non-outlier weights are stored in a low-bit. We widely evaluate GWQ on different task include language modeling, grounding detection, massive multitask language understanding and vision-language question and answering. Results show that models quantified by GWQ performs better than other quantization method. During quantization process, GWQ only need one calibration set to realize effective quant. Also, GWQ achieves 1.2x inference speedup in comparison to the original model and effectively reduces the inference memory.",
        "orkg_categories_flat": [
            "gwq",
            "weight-only quantization methods for llms"
        ],
        "papers_with_code_categories_flat": [
            "outlier detection",
            "quantization"
        ],
        "openalex_categories_flat": [
            "algorithm",
            "artificial intelligence",
            "computer science",
            "natural language processing techniques",
            "quantization (signal processing)",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache\n  Quantization",
        "doi": "10.48550/arxiv.2401.18079",
        "abstract": "LLMs are seeing growing use for applications which require large context windows, and with these large context windows KV cache activations surface as the dominant contributor to memory consumption during inference. Quantization is a promising approach for compressing KV cache activations; however, existing solutions fail to represent activations accurately in sub-4-bit precision. Our work, KVQuant, facilitates low precision KV cache quantization by incorporating several novel methods: (i) Per-Channel Key Quantization, where we adjust the dimension along which we quantize the Key activations to better match the distribution; (ii) Pre-RoPE Key Quantization, where we quantize Key activations before the rotary positional embedding to mitigate its impact on quantization; (iii) Non-Uniform KV Cache Quantization, where we derive per-layer sensitivity-weighted non-uniform datatypes that better represent the distributions; and (iv) Per-Vector Dense-and-Sparse Quantization, where we isolate outliers separately for each vector to minimize skews in quantization ranges. By applying our method to the LLaMA, Llama-2, Llama-3, and Mistral models, we achieve &lt; 0.1 perplexity degradation with 3-bit quantization on both Wikitext-2 and C4, outperforming existing approaches. Our method enables serving LLaMA-7B with a context length of up to 1 million on a single A100-80GB GPU and up to 10 million on an 8-GPU system. We develop custom CUDA kernels for KVQuant, showing that we can achieve up to ~1.7x speedups, compared to baseline fp16 matrix-vector multiplications, for the LLaMA-7B model.",
        "orkg_categories_flat": [
            "kv-cache quantization (kvq) methods for llms"
        ],
        "papers_with_code_categories_flat": [
            "quantization"
        ],
        "openalex_categories_flat": [
            "algorithm",
            "algorithms and data compression",
            "archaeology",
            "artificial intelligence",
            "cache",
            "computer science",
            "context (archaeology)",
            "history",
            "inference",
            "natural language processing techniques",
            "parallel computing",
            "quantization (signal processing)",
            "web data mining and analysis"
        ],
        "openaire_categories_flat": [
            "02 engineering and technology",
            "0202 electrical engineering, electronic engineering, information engineering",
            "0209 industrial biotechnology",
            "computer and information sciences",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "SPIQA: A Dataset for Multimodal Question Answering on Scientific Papers",
        "doi": "10.48550/arxiv.2407.09413",
        "abstract": "Seeking answers to questions within long scientific research articles is a crucial area of study that aids readers in quickly addressing their inquiries. However, existing question-answering (QA) datasets based on scientific papers are limited in scale and focus solely on textual content. We introduce SPIQA (Scientific Paper Image Question Answering), the first large-scale QA dataset specifically designed to interpret complex figures and tables within the context of scientific research articles across various domains of computer science. Leveraging the breadth of expertise and ability of multimodal large language models (MLLMs) to understand figures, we employ automatic and manual curation to create the dataset. We craft an information-seeking task on interleaved images and text that involves multiple images covering plots, charts, tables, schematic diagrams, and result visualizations. SPIQA comprises 270K questions divided into training, validation, and three different evaluation splits. Through extensive experiments with 12 prominent foundational models, we evaluate the ability of current multimodal systems to comprehend the nuanced aspects of research articles. Additionally, we propose a Chain-of-Thought (CoT) evaluation strategy with in-context retrieval that allows fine-grained, step-by-step assessment and improves model performance. We further explore the upper bounds of performance enhancement with additional textual information, highlighting its promising potential for future research and the dataset's impact on revolutionizing how we interact with scientific literature.",
        "orkg_categories_flat": [
            "computer science",
            "figure/table understanding",
            "scientific question answering"
        ],
        "papers_with_code_categories_flat": [
            "articles",
            "focus",
            "natural language processing",
            "question answering",
            "transformers",
            "visual question answering (vqa)"
        ],
        "openalex_categories_flat": [
            "advanced text analysis techniques",
            "computer science",
            "information retrieval",
            "natural language processing techniques",
            "question answering",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - computer vision and pattern recognition",
            "computer vision and pattern recognition"
        ]
    },
    {
        "title": "Artificial intelligence for literature reviews: opportunities and challenges",
        "doi": "10.1007/s10462-024-10902-3",
        "abstract": "I have added all the three tables to the review",
        "orkg_categories_flat": [
            "ai in systematic literature review",
            "any",
            "paper classification",
            "support vector machine"
        ],
        "papers_with_code_categories_flat": [
            "general",
            "optimization",
            "slr"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "automation",
            "best practice",
            "computer science",
            "computer security",
            "data science",
            "economics",
            "engineering",
            "explainable artificial intelligence (xai)",
            "field (mathematics)",
            "human–computer interaction",
            "law",
            "leverage (statistics)",
            "management",
            "management science",
            "mathematics",
            "mechanical engineering",
            "medline",
            "meta-analysis and systematic reviews",
            "political science",
            "pure mathematics",
            "software engineering",
            "systematic review",
            "topic modeling",
            "transparency (behavior)",
            "usability"
        ],
        "openaire_categories_flat": [
            "02 engineering and technology",
            "0202 electrical engineering, electronic engineering, information engineering",
            "03 medical and health sciences",
            "0302 clinical medicine",
            "05 social sciences",
            "0501 psychology and cognitive sciences",
            "0502 economics and business",
            "0503 education",
            "ai",
            "artificial intelligence",
            "artificial intelligence in education",
            "artificial intelligence, cloud engineering, resource allocation, decision systems, optimization",
            "artificial intelligence; evaluation framework; large language models; literature review; natural anguage processing; systematic literature reviews; usability;",
            "assessment",
            "biomedical",
            "challenges",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - human-computer interaction",
            "computer science - information retrieval",
            "design",
            "e-government",
            "education",
            "electronic computers. computer science",
            "elementary education",
            "emergency department",
            "engineering",
            "government services",
            "h",
            "human-computer interaction",
            "humans",
            "information retrieval",
            "l",
            "learning",
            "literature review",
            "machine learning",
            "medical technology",
            "mental health technology",
            "opportunities",
            "personalization",
            "personalized psychiatry",
            "psychiatric diagnosis",
            "psychiatry",
            "qa75.5-76.95",
            "r855-855.5",
            "rc435-571",
            "science & technology",
            "sensors",
            "social sciences",
            "systematic literature review",
            "systematic review",
            "teaching",
            "technology",
            "voice disorders"
        ]
    },
    {
        "title": "SciDQA: A Deep Reading Comprehension Dataset over Scientific Papers",
        "doi": "10.18653/v1/2024.emnlp-main.1163",
        "abstract": "18 pages, Accepted to EMNLP 2024",
        "orkg_categories_flat": [
            "deep reading and multi-document qa",
            "scientific papers",
            "scientific question answering"
        ],
        "papers_with_code_categories_flat": [
            "articles",
            "question answering",
            "reading comprehension"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "comprehension",
            "computer science",
            "data science",
            "information retrieval",
            "linguistics",
            "natural language processing",
            "natural language processing techniques",
            "philosophy",
            "programming language",
            "reading (process)",
            "reading comprehension",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling",
        "doi": "10.48550/arxiv.2304.01373",
        "abstract": "Code at https://github.com/EleutherAI/pythia",
        "orkg_categories_flat": [
            "causal language modeling",
            "large language models (llms)",
            "transformer model"
        ],
        "papers_with_code_categories_flat": [
            "common sense reasoning",
            "coreference resolution",
            "language modelling",
            "language models",
            "memorization",
            "natural language processing",
            "pythia",
            "question answering"
        ],
        "openalex_categories_flat": [
            "archaeology",
            "artificial intelligence",
            "code (set theory)",
            "computer science",
            "data science",
            "geometry",
            "history",
            "machine learning",
            "mathematics",
            "mathematics education",
            "memorization",
            "meteorology",
            "natural language processing techniques",
            "physics",
            "programming language",
            "psychology",
            "quantum mechanics",
            "regression analysis",
            "scale (ratio)",
            "scaling",
            "set (abstract data type)",
            "suite",
            "test case",
            "test suite",
            "topic modeling",
            "training (meteorology)",
            "training set"
        ],
        "openaire_categories_flat": [
            "004",
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "UL2: Unifying Language Learning Paradigms",
        "doi": "10.48550/arxiv.2205.05131",
        "abstract": "Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized &amp; unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 &amp; GPT-like models across multiple diverse setups. By scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised finetuning based NLP tasks. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization. On 0-shot MMLU, UL2 20B outperforms T0 and T5 models. UL2 20B also works well with chain-of-thought prompting and reasoning, making it an appealing choice for research into reasoning at a small to medium scale of 20B parameters. Finally, we apply FLAN instruction tuning to the UL2 20B model, achieving MMLU and Big-Bench scores competitive to FLAN-PaLM 62B. We release Flax-based T5X checkpoints for the UL2 20B &amp; Flan-UL2 20B.",
        "orkg_categories_flat": [
            "large language models (llms)",
            "mixture-of-denoisers, which combines diverse pretraining paradigms together",
            "transformer model"
        ],
        "papers_with_code_categories_flat": [
            "15 ways to contact how can i speak to someone at delta airlines",
            "activation functions",
            "adafactor",
            "adam",
            "arithmetic reasoning",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "attention patterns",
            "bpe",
            "common sense reasoning",
            "coreference resolution",
            "cosine annealing",
            "dense connections",
            "dropout",
            "feedforward networks",
            "fixed factorized attention",
            "gelu",
            "general",
            "glu",
            "gpt-3",
            "in-context learning",
            "information retrieval",
            "inverse square root schedule",
            "language models",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "long-range modeling",
            "mmlu",
            "multi-head attention",
            "multi-task language understanding",
            "natural language inference",
            "natural language processing",
            "normalization",
            "output functions",
            "question answering",
            "regularization",
            "residual connection",
            "retrieval",
            "sentencepiece",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "t5",
            "text classification",
            "text generation",
            "tokenizers",
            "transformers",
            "ul2",
            "weight decay",
            "word sense disambiguation"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "automatic summarization",
            "biology",
            "class (philosophy)",
            "computer science",
            "context (archaeology)",
            "database",
            "economics",
            "language model",
            "machine learning",
            "multimodal machine learning applications",
            "natural language processing",
            "natural language processing techniques",
            "operations management",
            "paleontology",
            "pareto principle",
            "perspective (graphical)",
            "scalability",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "01 natural sciences",
            "0105 earth and related environmental sciences",
            "02 engineering and technology",
            "0202 electrical engineering, electronic engineering, information engineering",
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
        "doi": "10.48550/arxiv.2307.09288",
        "abstract": "In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.",
        "orkg_categories_flat": [
            "large language models (llms)",
            "self-supervized learning",
            "transformer model"
        ],
        "papers_with_code_categories_flat": [
            "",
            "absolute position encodings",
            "activation functions",
            "adamw",
            "arithmetic reasoning",
            "attention",
            "attention mechanisms",
            "bpe",
            "code generation",
            "dense connections",
            "dropout",
            "entropy regularization",
            "feedforward network",
            "feedforward networks",
            "general",
            "grouped-query attention",
            "label smoothing",
            "math word problem solving",
            "multi-task language understanding",
            "multiple choice question answering (mcqa)",
            "natural language processing",
            "normalization",
            "output functions",
            "policy gradient methods",
            "position embeddings",
            "ppo",
            "question answering",
            "regularization",
            "reinforcement learning",
            "residual connection",
            "rmsnorm",
            "rotary embeddings",
            "sentence completion",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "swiglu",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "archaeology",
            "biology",
            "cartography",
            "competition (biology)",
            "computer science",
            "ecology",
            "engineering",
            "foundation (evidence)",
            "geography",
            "helpfulness",
            "mechanical engineering",
            "natural language processing techniques",
            "psychology",
            "scale (ratio)",
            "social psychology",
            "speech and dialogue systems",
            "topic modeling",
            "work (physics)"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "Have LLMs Advanced Enough? A Challenging Problem Solving Benchmark For Large Language Models",
        "doi": "10.18653/v1/2023.emnlp-main.468",
        "abstract": "The performance of large language models (LLMs) on existing reasoning benchmarks has significantly improved over the past years. In response, we present JEEBench, a considerably more challenging benchmark dataset for evaluating the problem solving abilities of LLMs. We curate 515 challenging pre-engineering mathematics, physics and chemistry problems from the highly competitive IIT JEE-Advanced exam. Long-horizon reasoning on top of deep in-domain knowledge is essential for solving problems in this benchmark. Our evaluation on various open-source and proprietary models reveals that the highest performance, even after using techniques like self-consistency, self-refinement and chain-of-thought prompting, is less than 40%. The typical failure modes of GPT-4, the best model, are errors in algebraic manipulation, difficulty in grounding abstract concepts into mathematical equations accurately and failure in retrieving relevant domain-specific concepts. We also observe that by mere prompting, GPT-4 is unable to assess risk introduced by negative marking for incorrect answers. For this, we develop a post-hoc confidence-thresholding method over self-consistency, which enables effective response selection. We hope that our challenging benchmark will guide future re-search in problem-solving using LLMs.",
        "orkg_categories_flat": [
            "large language models in general science",
            "scientific problem solving benchmark to evaluate the problem solving abilities of large-language-models"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "cosine annealing",
            "dense connections",
            "discriminative fine-tuning",
            "dropout",
            "feedforward networks",
            "fine-tuning",
            "gelu",
            "general",
            "gpt",
            "gpt-4",
            "label smoothing",
            "language models",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "overall - test",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers",
            "weight decay"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "benchmark (surveying)",
            "computer science",
            "consistency (knowledge bases)",
            "domain (mathematical analysis)",
            "engineering",
            "geodesy",
            "geography",
            "management science",
            "mathematical analysis",
            "mathematics",
            "natural language processing techniques",
            "software engineering research",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "ChemCrow: Augmenting large-language models with chemistry tools",
        "doi": "10.48550/arxiv.2304.05376",
        "abstract": "<jats:title>Abstract</jats:title><jats:p>Large language models (LLMs) have shown strong performance in tasks across domains but struggle with chemistry-related problems. These models also lack access to external knowledge sources, limiting their usefulness in scientific applications. We introduce ChemCrow, an LLM chemistry agent designed to accomplish tasks across organic synthesis, drug discovery and materials design. By integrating 18 expert-designed tools and using GPT-4 as the LLM, ChemCrow augments the LLM performance in chemistry, and new capabilities emerge. Our agent autonomously planned and executed the syntheses of an insect repellent and three organocatalysts and guided the discovery of a novel chromophore. Our evaluation, including both LLM and expert assessments, demonstrates ChemCrow’s effectiveness in automating a diverse set of chemical tasks. Our work not only aids expert chemists and lowers barriers for non-experts but also fosters scientific advancement by bridging the gap between experimental and computational chemistry.</jats:p>",
        "orkg_categories_flat": [
            "a novel llm-powered method for integrating computational tools in chemistry",
            "large language models in general science"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "computational chemistry",
            "dense connections",
            "dropout",
            "drug discovery",
            "feedforward networks",
            "general",
            "label smoothing",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "biochemistry",
            "bridging (networking)",
            "chemistry",
            "computer network",
            "computer science",
            "data science",
            "drug discovery",
            "engineering",
            "limiting",
            "machine learning in materials science",
            "materials science",
            "mechanical engineering",
            "nanotechnology",
            "programming language",
            "set (abstract data type)"
        ],
        "openaire_categories_flat": [
            "03 medical and health sciences",
            "0301 basic medicine",
            "0303 health sciences",
            "article",
            "chemical physics",
            "computer and information sciences",
            "machine learning",
            "physical sciences",
            "physics - chemical physics",
            "statistics - machine learning"
        ]
    },
    {
        "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct",
        "doi": "10.48550/arxiv.2306.08568",
        "abstract": "Code Large Language Models (Code LLMs), such as StarCoder, have demonstrated exceptional performance in code-related tasks. However, most existing models are solely pre-trained on extensive raw code data without instruction fine-tuning. In this paper, we introduce WizardCoder, which empowers Code LLMs with complex instruction fine-tuning, by adapting the Evol-Instruct method to the domain of code. Through comprehensive experiments on four prominent code generation benchmarks, namely HumanEval, HumanEval+, MBPP, and DS-1000, we unveil the exceptional capabilities of our model. It surpasses all other open-source Code LLMs by a substantial margin. Moreover, our model even outperforms the largest closed LLMs, Anthropic's Claude and Google's Bard, on HumanEval and HumanEval+. Our code, model weights, and data are public at https://github.com/nlpxucan/WizardLM",
        "orkg_categories_flat": [
            "a>>aaaaaaaaaaatransformer model",
            "large language models (llms)"
        ],
        "papers_with_code_categories_flat": [
            "code generation",
            "humaneval",
            "mbpp"
        ],
        "openalex_categories_flat": [
            "code (set theory)",
            "computer science",
            "machine learning",
            "margin (machine learning)",
            "natural language processing techniques",
            "programming language",
            "set (abstract data type)",
            "software engineering research",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "StarCoder: may the source be with you!",
        "doi": "10.48550/arxiv.2305.06161",
        "abstract": "The BigCode community, an open-scientific collaboration working on the responsible development of Large Language Models for Code (Code LLMs), introduces StarCoder and StarCoderBase: 15.5B parameter models with 8K context length, infilling capabilities and fast large-batch inference enabled by multi-query attention. StarCoderBase is trained on 1 trillion tokens sourced from The Stack, a large collection of permissively licensed GitHub repositories with inspection tools and an opt-out process. We fine-tuned StarCoderBase on 35B Python tokens, resulting in the creation of StarCoder. We perform the most comprehensive evaluation of Code LLMs to date and show that StarCoderBase outperforms every open Code LLM that supports multiple programming languages and matches or outperforms the OpenAI code-cushman-001 model. Furthermore, StarCoder outperforms every model that is fine-tuned on Python, can be prompted to achieve 40\\% pass@1 on HumanEval, and still retains its performance on other programming languages. We take several important steps towards a safe open-access model release, including an improved PII redaction pipeline and a novel attribution tracing tool, and make the StarCoder models publicly available under a more commercially viable version of the Open Responsible AI Model license.",
        "orkg_categories_flat": [
            "large language models (llms)",
            "transformer model"
        ],
        "papers_with_code_categories_flat": [
            "8k",
            "code generation",
            "humaneval",
            "pii redaction"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "biology",
            "computer science",
            "context (archaeology)",
            "inference",
            "license",
            "machine learning and data classification",
            "mit license",
            "open source",
            "operating system",
            "paleontology",
            "programming language",
            "python (programming language)",
            "software",
            "software engineering",
            "software engineering research",
            "source code",
            "topic modeling",
            "tracing",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - programming languages",
            "computer science - software engineering",
            "programming languages",
            "software engineering"
        ]
    },
    {
        "title": "WizardLM: Empowering Large Language Models to Follow Complex Instructions",
        "doi": "10.48550/arxiv.2304.12244",
        "abstract": "Training large language models (LLMs) with open-domain instruction following data brings colossal success. However, manually creating such instruction data is very time-consuming and labor-intensive. Moreover, humans may struggle to produce high-complexity instructions. In this paper, we show an avenue for creating large amounts of instruction data with varying levels of complexity using LLM instead of humans. Starting with an initial set of instructions, we use our proposed Evol-Instruct to rewrite them step by step into more complex instructions. Then, we mix all generated instruction data to fine-tune LLaMA. We call the resulting model WizardLM. Human evaluations on a complexity-balanced test bed and Vicuna's testset show that instructions from Evol-Instruct are superior to human-created ones. By analyzing the human evaluation results of the high complexity part, we demonstrate that outputs from our WizardLM are preferred to outputs from OpenAI ChatGPT. In GPT-4 automatic evaluation, WizardLM achieves more than 90\\% capacity of ChatGPT on 17 out of 29 skills. Even though WizardLM still lags behind ChatGPT in some aspects, our findings suggest that fine-tuning with AI-evolved instructions is a promising direction for enhancing LLMs. Our code and data are public at https://github.com/nlpxucan/WizardLM",
        "orkg_categories_flat": [
            "large language models (llms)",
            "supervized open-domain complex instruction finetuning",
            "transformer model"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "active learning",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "instruction following",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "test",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "biology",
            "code (set theory)",
            "computer science",
            "domain (mathematical analysis)",
            "factor (programming language)",
            "human–computer interaction",
            "language model",
            "machine learning and data classification",
            "mathematical analysis",
            "mathematics",
            "natural language processing techniques",
            "paleontology",
            "programming language",
            "set (abstract data type)",
            "test (biology)",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "WizardMath: Empowering Mathematical Reasoning for Large Language Models via Reinforced Evol-Instruct",
        "doi": "10.48550/arxiv.2308.09583",
        "abstract": "Large language models (LLMs), such as GPT-4, have shown remarkable performance in natural language processing (NLP) tasks, including challenging mathematical reasoning. However, most existing open-source models are only pre-trained on large-scale internet data and without math-related optimization. In this paper, we present WizardMath, which enhances the mathematical CoT reasoning abilities of LLMs without using external python tools, by applying our proposed Reinforcement Learning from Evol-Instruct Feedback (RLEIF) method to the domain of math. Through extensive experiments on two mathematical reasoning benchmarks, namely GSM8k and MATH, we reveal the extraordinary capabilities of our model. Remarkably, WizardMath-Mistral 7B surpasses top-tier open-source LLMs by a substantial margin with higher data efficiency. Furthermore, WizardMath 70B even outperforms GPT-3.5-Turbo, Claude 2, Gemini Pro and GPT-4-early-version. Additionally, our preliminary exploration highlights the pivotal role of instruction evolution and process supervision in achieving exceptional math performance. For more details refer to https://github.com/nlpxucan/WizardLM",
        "orkg_categories_flat": [
            "ai in mathematics",
            "reinforcement learning from evol-instruct feedback (rleif)"
        ],
        "papers_with_code_categories_flat": [
            "15 ways to contact how can i speak to someone at delta airlines",
            "absolute position encodings",
            "activation functions",
            "adam",
            "arithmetic reasoning",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "attention patterns",
            "bpe",
            "cosine annealing",
            "dense connections",
            "dropout",
            "feedforward networks",
            "fixed factorized attention",
            "gelu",
            "general",
            "gpt-3",
            "gpt-4",
            "gsm8k",
            "label smoothing",
            "language models",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "math",
            "math word problem solving",
            "mathematical reasoning",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers",
            "weight decay"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "computer science",
            "domain (mathematical analysis)",
            "language of mathematics",
            "machine learning",
            "machine learning and data classification",
            "margin (machine learning)",
            "mathematical analysis",
            "mathematics",
            "mathematics education",
            "natural language processing techniques",
            "open source",
            "programming language",
            "software",
            "the internet",
            "topic modeling",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open Generative Large Language Models",
        "doi": "10.48550/arxiv.2308.16149",
        "abstract": "Arabic-centric, foundation model, large-language model, LLM, generative model, instruction-tuned, Jais, Jais-chat",
        "orkg_categories_flat": [
            "causal language modeling",
            "instruction tuning",
            "large language models (llms)",
            "transformer model"
        ],
        "papers_with_code_categories_flat": [
            "15 ways to contact how can i speak to someone at delta airlines",
            "activation functions",
            "adam",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "attention patterns",
            "bpe",
            "cosine annealing",
            "decoder",
            "dense connections",
            "dropout",
            "feedforward networks",
            "fixed factorized attention",
            "gelu",
            "general",
            "gpt-3",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "regularization",
            "residual connection",
            "safety alignment",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformers",
            "weight decay"
        ],
        "openalex_categories_flat": [
            "arabic",
            "archaeology",
            "artificial intelligence",
            "code (set theory)",
            "computer science",
            "foundation (evidence)",
            "generative grammar",
            "history",
            "linguistics",
            "machine learning",
            "machine learning in healthcare",
            "margin (machine learning)",
            "natural language processing",
            "natural language processing techniques",
            "open source",
            "philosophy",
            "programming language",
            "set (abstract data type)",
            "software",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "68t50",
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "f.2.2; i.2.7",
            "machine learning"
        ]
    },
    {
        "title": "Orca: Progressive Learning from Complex Explanation Traces of GPT-4",
        "doi": "10.48550/arxiv.2306.02707",
        "abstract": "Recent research has focused on enhancing the capability of smaller models through imitation learning, drawing on the outputs generated by large foundation models (LFMs). A number of issues impact the quality of these models, ranging from limited imitation signals from shallow LFM outputs; small scale homogeneous training data; and most notably a lack of rigorous evaluation resulting in overestimating the small model's capability as they tend to learn to imitate the style, but not the reasoning process of LFMs. To address these challenges, we develop Orca (We are working with our legal team to publicly release a diff of the model weights in accordance with LLaMA's release policy to be published at https://aka.ms/orca-lm), a 13-billion parameter model that learns to imitate the reasoning process of LFMs. Orca learns from rich signals from GPT-4 including explanation traces; step-by-step thought processes; and other complex instructions, guided by teacher assistance from ChatGPT. To promote this progressive learning, we tap into large-scale and diverse imitation data with judicious sampling and selection. Orca surpasses conventional state-of-the-art instruction-tuned models such as Vicuna-13B by more than 100% in complex zero-shot reasoning benchmarks like Big-Bench Hard (BBH) and 42% on AGIEval. Moreover, Orca reaches parity with ChatGPT on the BBH benchmark and shows competitive performance (4 pts gap with optimized system message) in professional and academic examinations like the SAT, LSAT, GRE, and GMAT, both in zero-shot settings without CoT; while trailing behind GPT-4. Our research indicates that learning from step-by-step explanations, whether these are generated by humans or more advanced AI models, is a promising direction to improve model capabilities and skills.",
        "orkg_categories_flat": [
            "explanation tuning",
            "large language models (llms)",
            "transformer model"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "imitation learning",
            "knowledge distillation",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "benchmark (surveying)",
            "computer science",
            "explainable artificial intelligence (xai)",
            "geodesy",
            "geography",
            "imitation",
            "machine learning",
            "machine learning in healthcare",
            "operating system",
            "physics",
            "process (computing)",
            "psychology",
            "quantum mechanics",
            "scale (ratio)",
            "social psychology",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "CMATH: Can Your Language Model Pass Chinese Elementary School Math Test?",
        "doi": "10.48550/arxiv.2306.16636",
        "abstract": "We present the Chinese Elementary School Math Word Problems (CMATH) dataset, comprising 1.7k elementary school-level math word problems with detailed annotations, source from actual Chinese workbooks and exams. This dataset aims to provide a benchmark tool for assessing the following question: to what grade level of elementary school math do the abilities of popular large language models (LLMs) correspond? We evaluate a variety of popular LLMs, including both commercial and open-source options, and discover that only GPT-4 achieves success (accuracy $\\geq$ 60\\%) across all six elementary school grades, while other models falter at different grade levels. Furthermore, we assess the robustness of several top-performing LLMs by augmenting the original problems in the CMATH dataset with distracting information. Our findings reveal that GPT-4 is able to maintains robustness, while other model fail. We anticipate that our study will expose limitations in LLMs' arithmetic and reasoning capabilities, and promote their ongoing development and advancement.",
        "orkg_categories_flat": [
            "assess the robustness of several top-performing llms by augmenting the original problems in the cmath dataset with distracting information",
            "exploring language models in mathematics"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "fail",
            "feedforward networks",
            "general",
            "gpt-4",
            "label smoothing",
            "language modeling",
            "language modelling",
            "language models",
            "layer normalization",
            "linear layer",
            "math",
            "math word problem solving",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "benchmark (surveying)",
            "biochemistry",
            "cartography",
            "chemistry",
            "computer science",
            "elementary mathematics",
            "gene",
            "geography",
            "mathematics education",
            "mathematics, computing, and information processing",
            "natural language processing techniques",
            "open source",
            "programming language",
            "psychology",
            "robustness (evolution)",
            "software",
            "text readability and simplification"
        ],
        "openaire_categories_flat": [
            "4. education",
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - machine learning",
            "machine learning"
        ]
    },
    {
        "title": "MEGA: Multilingual Evaluation of Generative AI",
        "doi": "10.18653/v1/2023.emnlp-main.258",
        "abstract": "Generative AI models have shown impressive performance on many Natural Language Processing tasks such as language understanding, reasoning, and language generation. An important question being asked by the AI community today is about the capabilities and limits of these models, and it is clear that evaluating generative AI is very challenging. Most studies on generative LLMs have been restricted to English and it is unclear how capable these models are at understanding and generating text in other languages. We present the first comprehensive benchmarking of generative LLMs - MEGA, which evaluates models on standard NLP benchmarks, covering 16 NLP datasets across 70 typologically diverse languages. We compare the performance of generative LLMs including Chat-GPT and GPT-4 to State of the Art (SOTA) non-autoregressive models on these tasks to determine how well generative models perform compared to the previous generation of LLMs. We present a thorough analysis of the performance of models across languages and tasks and discuss challenges in improving the performance of generative LLMs on low-resource languages. We create a framework for evaluating generative LLMs in the multilingual setting and provide directions for future progress in the field.",
        "orkg_categories_flat": [
            "benchmarking of generative llms - mega, which evaluates models on standard nlp benchmarks",
            "commonsense reasoning",
            "exploring large language models for multilingual tasks",
            "natural language inference",
            "question answering",
            "sequence-labelling with blstm",
            "summarization"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "benchmarking",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "astronomy",
            "computer science",
            "generative grammar",
            "mega-",
            "natural language processing",
            "natural language processing techniques",
            "physics"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "ChatGPT Beyond English: Towards a Comprehensive Evaluation of Large Language Models in Multilingual Learning",
        "doi": "10.18653/v1/2023.findings-emnlp.878",
        "abstract": "Over the last few years, large language models (LLMs) have emerged as the most important breakthroughs in natural language processing (NLP) that fundamentally transform research and developments in the field. ChatGPT represents one of the most exciting LLM systems developed recently to showcase impressive skills for language generation and highly attract public attention. Among various exciting applications discovered for ChatGPT in English, the model can process and generate texts for multiple languages due to its multilingual training data. Given the broad adoption of ChatGPT for English in different problems and areas, a natural question is whether ChatGPT can also be applied effectively for other languages or it is necessary to develop more language-specific technologies. The answer to this question requires a thorough evaluation of ChatGPT over multiple tasks with diverse languages and large datasets (i.e., beyond reported anecdotes), which is still missing or limited in current research. Our work aims to fill this gap for the evaluation of ChatGPT and similar LLMs to provide more comprehensive information for multilingual NLP applications. While this work will be an ongoing effort to include additional experiments in the future, our current paper evaluates ChatGPT on 7 different tasks, covering 37 diverse languages with high, medium, low, and extremely low resources. We also focus on the zero-shot learning setting for ChatGPT to improve reproducibility and better simulate the interactions of general users. Compared to the performance of previous models, our extensive experimental results demonstrate a worse performance of ChatGPT for different NLP tasks and languages, calling for further research to develop better models and understanding for multilingual learning.",
        "orkg_categories_flat": [
            "common sense reasoning (csr)",
            "exploring large language models for multilingual tasks",
            "named entity recognition (ner)",
            "natural language inference (nli)",
            "part of speech (pos) tagging",
            "question answering (qa)",
            "relation classification",
            "summarization",
            "to evaluate the performance of chatgpt and llms for nlp tasks in different languages"
        ],
        "papers_with_code_categories_flat": [
            "multilingual nlp",
            "text generation",
            "zero-shot learning"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "artificial intelligence in healthcare and education",
            "computer science",
            "data science",
            "field (mathematics)",
            "mathematics",
            "natural language",
            "natural language processing",
            "operating system",
            "process (computing)",
            "pure mathematics",
            "text readability and simplification",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "M3Exam: A Multilingual, Multimodal, Multilevel Benchmark for Examining Large Language Models",
        "doi": "10.48550/arxiv.2306.05179",
        "abstract": "Despite the existence of various benchmarks for evaluating natural language processing models, we argue that human exams are a more suitable means of evaluating general intelligence for large language models (LLMs), as they inherently demand a much wider range of abilities such as language understanding, domain knowledge, and problem-solving skills. To this end, we introduce M3Exam, a novel benchmark sourced from real and official human exam questions for evaluating LLMs in a multilingual, multimodal, and multilevel context. M3Exam exhibits three unique characteristics: (1) multilingualism, encompassing questions from multiple countries that require strong multilingual proficiency and cultural knowledge; (2) multimodality, accounting for the multimodal nature of many exam questions to test the model's multimodal understanding capability; and (3) multilevel structure, featuring exams from three critical educational periods to comprehensively assess a model's proficiency at different levels. In total, M3Exam contains 12,317 questions in 9 diverse languages with three educational levels, where about 23\\% of the questions require processing images for successful solving. We assess the performance of top-performing LLMs on M3Exam and find that current models, including GPT-4, still struggle with multilingual text, particularly in low-resource and non-Latin script languages. Multimodal LLMs also perform poorly with complex multimodal questions. We believe that M3Exam can be a valuable resource for comprehensively evaluating LLMs by examining their multilingual and multimodal abilities and tracking their development. Data and evaluation code is available at \\url{https://github.com/DAMO-NLP-SG/M3Exam}.",
        "orkg_categories_flat": [
            "exploring evaluation benchmarks for large language models",
            "proposed a novel benchmark dataset for evaluating llms by offering a multilingual, multimodal, and multi-level assessment",
            "specific downstream task"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "archaeology",
            "artificial intelligence",
            "benchmark (surveying)",
            "computer science",
            "context (archaeology)",
            "geodesy",
            "geography",
            "multilingualism",
            "multimodality",
            "natural language processing",
            "natural language processing techniques",
            "pedagogy",
            "psychology",
            "text readability and simplification",
            "topic modeling",
            "world wide web"
        ],
        "openaire_categories_flat": [
            "4. education",
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language",
            "computer science - computer vision and pattern recognition",
            "computer vision and pattern recognition"
        ]
    },
    {
        "title": "Measuring Massive Multitask Chinese Understanding",
        "doi": "10.48550/arxiv.2304.12986",
        "abstract": "The development of large-scale Chinese language models is flourishing, yet there is a lack of corresponding capability assessments. Therefore, we propose a test to measure the multitask accuracy of large Chinese language models. This test encompasses four major domains, including medicine, law, psychology, and education, with 15 subtasks in medicine and 8 subtasks in education. We found that the best-performing models in the zero-shot setting outperformed the worst-performing models by nearly 18.6 percentage points on average. Across the four major domains, the highest average zero-shot accuracy of all models is 0.512. In the subdomains, only the GPT-3.5-turbo model achieved a zero-shot accuracy of 0.693 in clinical medicine, which was the highest accuracy among all models across all subtasks. All models performed poorly in the legal domain, with the highest zero-shot accuracy reaching only 0.239. By comprehensively evaluating the breadth and depth of knowledge across multiple disciplines, this test can more accurately identify the shortcomings of the models.",
        "orkg_categories_flat": [
            "exploring large language models for multilingual tasks"
        ],
        "papers_with_code_categories_flat": [
            "15 ways to contact how can i speak to someone at delta airlines",
            "activation functions",
            "active learning",
            "adam",
            "all",
            "attention",
            "attention dropout",
            "attention mechanisms",
            "attention modules",
            "attention patterns",
            "bpe",
            "cosine annealing",
            "dense connections",
            "dropout",
            "feedforward networks",
            "fixed factorized attention",
            "gelu",
            "general",
            "gpt-3",
            "layer normalization",
            "learning rate schedules",
            "linear layer",
            "linear warmup with cosine annealing",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "test",
            "transformers",
            "weight decay"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "biology",
            "cartography",
            "chemistry",
            "computer science",
            "explainable artificial intelligence (xai)",
            "flourishing",
            "geography",
            "linguistics",
            "machine learning",
            "machine learning in healthcare",
            "natural language processing",
            "organic chemistry",
            "paleontology",
            "philosophy",
            "psychology",
            "radiomics and machine learning in medical imaging",
            "scale (ratio)",
            "shot (pellet)",
            "social psychology",
            "test (biology)",
            "zero (linguistics)"
        ],
        "openaire_categories_flat": [
            "3. good health",
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language"
        ]
    },
    {
        "title": "Evaluating language models for mathematics through interactions",
        "doi": "10.1073/pnas.2318124121",
        "abstract": "<jats:p>There is much excitement about the opportunity to harness the power of large language models (LLMs) when building problem-solving assistants. However, the standard methodology of evaluating LLMs relies on static pairs of inputs and outputs; this is insufficient for making an informed decision about which LLMs are best to use in an interactive setting, and how that varies by setting. Static assessment therefore limits how we understand language model capabilities. We introduce CheckMate, an adaptable prototype platform for humans to interact with and evaluate LLMs. We conduct a study with CheckMate to evaluate three language models (InstructGPT, ChatGPT, and GPT-4) as assistants in proving undergraduate-level mathematics, with a mixed cohort of participants from undergraduate students to professors of mathematics. We release the resulting interaction and rating dataset, MathConverse. By analyzing MathConverse, we derive a taxonomy of human query behaviors and uncover that despite a generally positive correlation, there are notable instances of divergence between correctness and perceived helpfulness in LLM generations, among other findings. Further, we garner a more granular understanding of GPT-4 mathematical problem-solving through a series of case studies, contributed by experienced mathematicians. We conclude with actionable takeaways for ML practitioners and mathematicians: models that communicate uncertainty, respond well to user corrections, and can provide a concise rationale for their recommendations, may constitute better assistants. Humans should inspect LLM output carefully given their current shortcomings and potential for surprising fallibility.</jats:p>",
        "orkg_categories_flat": [
            "exploring language models in mathematics",
            "interactive and dynamic evaluation of llms in mathematics"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "aware",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "graph representation learning",
            "graphs",
            "label smoothing",
            "language modelling",
            "language models",
            "layer normalization",
            "linear layer",
            "mathematical problem-solving",
            "mathematical reasoning",
            "multi-head attention",
            "natural language processing",
            "navigate",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "computer science",
            "correctness",
            "divergence (linguistics)",
            "engineering",
            "explainable artificial intelligence (xai)",
            "helpfulness",
            "linguistics",
            "management science",
            "mathematics education",
            "philosophy",
            "programming language",
            "psychology",
            "readability",
            "social psychology",
            "software engineering research",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "administración de proyectos culturales",
            "administración pública",
            "ai",
            "bienes públicos",
            "bilingual",
            "collaborative",
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language",
            "computer science - human-computer interaction",
            "computer science - machine learning",
            "cooperative learning classrooms",
            "crime",
            "crimen",
            "educación",
            "educación y crimen",
            "education",
            "human-computer interaction",
            "humans",
            "human–computer interaction",
            "language",
            "language models",
            "machine learning",
            "mathematics",
            "physical sciences",
            "política cultural-impacto social",
            "política pública",
            "política pública-aspectos sociales",
            "practices",
            "prevención del delito",
            "problem solving",
            "public goods",
            "public policy",
            "students",
            "theorem proving"
        ]
    },
    {
        "title": "How well do Large Language Models perform in Arithmetic tasks?",
        "doi": "10.48550/arxiv.2304.02015",
        "abstract": "Large language models have emerged abilities including chain-of-thought to answer math word problems step by step. Solving math word problems not only requires abilities to disassemble problems via chain-of-thought but also needs to calculate arithmetic expressions correctly for each step. To the best of our knowledge, there is no work to focus on evaluating the arithmetic ability of large language models. In this work, we propose an arithmetic dataset MATH 401 to test the latest large language models including GPT-4, ChatGPT, InstrctGPT, Galactica, and LLaMA with various arithmetic expressions and provide a detailed analysis of the ability of large language models. MATH 401 and evaluation codes are released at \\url{https://github.com/GanjinZero/math401-llm}.",
        "orkg_categories_flat": [
            "exploring language models in mathematics",
            "propose an arithmetic dataset math 401 to test latest large language models"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "active learning",
            "adam",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "dense connections",
            "dropout",
            "feedforward networks",
            "galactica",
            "general",
            "gpt-4",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "math",
            "multi-head attention",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "test",
            "transformer",
            "transformers"
        ],
        "openalex_categories_flat": [
            "arithmetic",
            "computer science",
            "focus (optics)",
            "linguistics",
            "mathematics",
            "natural language processing",
            "optics",
            "philosophy",
            "physics",
            "theoretical computer science",
            "topic modeling",
            "word (group theory)"
        ],
        "openaire_categories_flat": [
            "03 medical and health sciences",
            "0302 clinical medicine",
            "05 social sciences",
            "0501 psychology and cognitive sciences",
            "2. zero hunger",
            "artificial intelligence",
            "bayes theorem",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer simulation",
            "executive function",
            "functional architecture",
            "humans",
            "language localization",
            "language localizer",
            "models, biological",
            "modularity",
            "nonlinear dynamics",
            "pharmacokinetics",
            "pharmacology, clinical",
            "psychology",
            "reproducibility of results",
            "software"
        ]
    },
    {
        "title": "StructGPT: A General Framework for Large Language Model to Reason over Structured Data",
        "doi": "10.18653/v1/2023.emnlp-main.574",
        "abstract": "In this paper, we study how to improve the zero-shot reasoning ability of large language models~(LLMs) over structured data in a unified way. Inspired by the study on tool augmentation for LLMs, we develop an \\emph{Iterative Reading-then-Reasoning~(IRR)} approach for solving question answering tasks based on structured data, called \\textbf{StructGPT}. In our approach, we construct the specialized function to collect relevant evidence from structured data (\\ie \\emph{reading}), and let LLMs concentrate the reasoning task based on the collected information (\\ie \\emph{reasoning}). Specially, we propose an \\emph{invoking-linearization-generation} procedure to support LLMs in reasoning on the structured data with the help of the external interfaces. By iterating this procedures with provided interfaces, our approach can gradually approach the target answer to a given query. Extensive experiments conducted on three types of structured data demonstrate the effectiveness of our approach, which can significantly boost the performance of ChatGPT and achieve comparable performance against the full-data supervised-tuning baselines. Our codes and data are publicly available at~\\url{https://github.com/RUCAIBox/StructGPT}.",
        "orkg_categories_flat": [
            "development and evaluation of a sota llm-based nl-to-sql translation interface"
        ],
        "papers_with_code_categories_flat": [
            "language modeling",
            "language modelling",
            "large language model",
            "question answering"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "case-based reasoning",
            "computer science",
            "construct (python library)",
            "deductive reasoning",
            "economics",
            "linguistics",
            "management",
            "natural language processing",
            "natural language processing techniques",
            "philosophy",
            "programming language",
            "reading (process)",
            "task (project management)",
            "text and document classification technologies",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "MMBench: Is Your Multi-modal Model an All-Around Player?",
        "doi": "10.1007/978-3-031-72658-3_13",
        "abstract": "Accepted in ECCV2024 as Oral Presentation",
        "orkg_categories_flat": [
            "develops a comprehensive evaluation pipeline, primarily comprised of two elements",
            "exploring evaluation benchmarks for large language models",
            "general language task",
            "multimodal llms"
        ],
        "papers_with_code_categories_flat": [
            "all",
            "instruction following",
            "multiple-choice",
            "visual question answering"
        ],
        "openalex_categories_flat": [
            "composite material",
            "computer science",
            "materials science",
            "modal",
            "multimodal machine learning applications",
            "natural language processing techniques",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language",
            "computer science - computer vision and pattern recognition",
            "computer vision and pattern recognition"
        ]
    },
    {
        "title": "MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models",
        "doi": "10.48550/arxiv.2306.13394",
        "abstract": "Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform multimodal tasks, showing amazing emergent abilities in recent studies, such as writing poems based on an image. However, it is difficult for these case studies to fully reflect the performance of MLLM, lacking a comprehensive evaluation. In this paper, we fill in this blank, presenting the first comprehensive MLLM Evaluation benchmark MME. It measures both perception and cognition abilities on a total of 14 subtasks. In order to avoid data leakage that may arise from direct use of public datasets for evaluation, the annotations of instruction-answer pairs are all manually designed. The concise instruction design allows us to fairly compare MLLMs, instead of struggling in prompt engineering. Besides, with such an instruction, we can also easily carry out quantitative statistics. A total of 30 advanced MLLMs are comprehensively evaluated on our MME, which not only suggests that existing MLLMs still have a large room for improvement, but also reveals the potential directions for the subsequent model optimization. The data application manner and online leaderboards are released at https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation.",
        "orkg_categories_flat": [
            "coarse-grained recognition",
            "exploring evaluation benchmarks for large language models",
            "fine-grained recognition",
            "general language task",
            "measures both perception and cognition abilities on a total of 14 subtasks",
            "multimodal llms",
            "numerical calculation",
            "optical character recognition",
            "text translation"
        ],
        "papers_with_code_categories_flat": [
            "benchmarking",
            "language modeling",
            "language modelling",
            "large language model",
            "mme",
            "model optimization",
            "multimodal large language model",
            "prompt engineering"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "benchmark (surveying)",
            "biology",
            "computer science",
            "geodesy",
            "geography",
            "machine learning",
            "multimodal machine learning applications",
            "natural language processing",
            "natural language processing techniques",
            "neuroscience",
            "perception",
            "selection (genetic algorithm)",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "artificial intelligence",
            "computation and language",
            "computer and information sciences",
            "computer science - artificial intelligence",
            "computer science - computation and language",
            "computer science - computer vision and pattern recognition",
            "computer vision and pattern recognition"
        ]
    },
    {
        "title": "Xiezhi: An Ever-Updating Benchmark for Holistic Domain Knowledge Evaluation",
        "doi": "10.1609/aaai.v38i16.29767",
        "abstract": "<jats:p>New Natural Langauge Process~(NLP) benchmarks are urgently needed to align with the rapid development of large language models (LLMs). We present Xiezhi, the most comprehensive evaluation suite designed to assess holistic domain knowledge.Xiezhi comprises multiple-choice questions across 516 diverse disciplines ranging from 13 different subjects with 249,587 questions and accompanied by Xiezhi-Specialty with 14,041 questions and Xiezhi-Interdiscipline with 10,746 questions. We conduct evaluation of the 47 cutting-edge LLMs on Xiezhi. Results indicate that LLMs exceed average performance of humans in science, engineering, agronomy, medicine, and art, but fall short in economics, jurisprudence, pedagogy, literature, history, and management. All the evaluation code and data are open sourced in https://github.com/MikeGu721/XiezhiBenchmark</jats:p>",
        "orkg_categories_flat": [
            "comprehensive domain knowledge",
            "designed to assess holistic domain knowledge",
            "exploring evaluation benchmarks for large language models",
            "general language task"
        ],
        "papers_with_code_categories_flat": [
            "align",
            "computer vision",
            "jurisprudence",
            "management",
            "multiple-choice",
            "vision and language pre-trained models"
        ],
        "openalex_categories_flat": [
            "artificial intelligence",
            "benchmark (surveying)",
            "cartography",
            "computer science",
            "domain (mathematical analysis)",
            "domain knowledge",
            "geography",
            "knowledge management",
            "mathematical analysis",
            "mathematics",
            "semantic web and ontologies"
        ],
        "openaire_categories_flat": [
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    },
    {
        "title": "C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models",
        "doi": "10.48550/arxiv.2305.08322",
        "abstract": "New NLP benchmarks are urgently needed to align with the rapid development of large language models (LLMs). We present C-Eval, the first comprehensive Chinese evaluation suite designed to assess advanced knowledge and reasoning abilities of foundation models in a Chinese context. C-Eval comprises multiple-choice questions across four difficulty levels: middle school, high school, college, and professional. The questions span 52 diverse disciplines, ranging from humanities to science and engineering. C-Eval is accompanied by C-Eval Hard, a subset of very challenging subjects in C-Eval that requires advanced reasoning abilities to solve. We conduct a comprehensive evaluation of the most advanced LLMs on C-Eval, including both English- and Chinese-oriented models. Results indicate that only GPT-4 could achieve an average accuracy of over 60%, suggesting that there is still significant room for improvement for current LLMs. We anticipate C-Eval will help analyze important strengths and shortcomings of foundation models, and foster their development and growth for Chinese users.",
        "orkg_categories_flat": [
            "chinese evaluation",
            "designed to assess advanced knowledge and reasoning abilities of foundation models in a chinese context",
            "exploring evaluation benchmarks for large language models",
            "five shot",
            "general language task",
            "zero shot"
        ],
        "papers_with_code_categories_flat": [
            "absolute position encodings",
            "adam",
            "align",
            "attention",
            "attention mechanisms",
            "attention modules",
            "bpe",
            "computer vision",
            "dense connections",
            "dropout",
            "feedforward networks",
            "general",
            "gpt-4",
            "label smoothing",
            "language models",
            "layer normalization",
            "linear layer",
            "multi-head attention",
            "multiple-choice",
            "natural language processing",
            "normalization",
            "output functions",
            "position embeddings",
            "position-wise feed-forward layer",
            "regularization",
            "residual connection",
            "skip connections",
            "softmax",
            "stochastic optimization",
            "subword segmentation",
            "transformer",
            "transformers",
            "vision and language pre-trained models"
        ],
        "openalex_categories_flat": [
            "archaeology",
            "artificial intelligence",
            "computer science",
            "context (archaeology)",
            "foundation (evidence)",
            "geography",
            "mathematics education",
            "natural language processing techniques",
            "psychology",
            "suite",
            "text readability and simplification",
            "topic modeling"
        ],
        "openaire_categories_flat": [
            "4. education",
            "computation and language",
            "computer and information sciences",
            "computer science - computation and language"
        ]
    }
]