#!/usr/bin/env python

# Script takes a Rosecheckers output file and extracts its alert
# information
#
# The first argument indicates the file containing the input.
# The second argument specifies the output file.
#
# The text data in the input file should be produced from a build
# process using make and g++.  A suitable command to generate the
# text data is:
#
# make > makelog 2>&1
#
# This script produces only one message per alert
#
# <legal># 
# Alert Type Frequency Assessment of Open-Source Static Analysis Tools and Codebases# 
# Copyright 2023 Carnegie Mellon University.# 
# 
# NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.# 
# 
# This work is licensed under a Creative Commons Attribution 4.0 International License. https://creativecommons.org/licenses/by/4.0/# 
# 
# [DISTRIBUTION STATEMENT A] This material has been approved for public release and unlimited distribution.  Please see Copyright notice for non-US Government use and distribution.# 
# 
# Carnegie Mellon® and CERT® are registered in the U.S. Patent and Trademark Office by Carnegie Mellon University.# 
# 
# This dataset was created with the use of various toolsets and data inputs each subject to its own license.# 
# 
# DM23-0522# 
# 
# </legal># 

import sys
import re
import os


# Patterns for the three kinds of log lines the parser recognizes.  They are
# compiled once here instead of being re-resolved for every input line.
_DIRECTORY_RE = re.compile(r"^In directory: *(.*)$")
_COMPILER_ARGS_RE = re.compile(r"^Compiler args are: .* (.*?)$")
_ALERT_RE = re.compile(
    r"^(.*?):([0-9]*)(?::[0-9]*)?: (warning|error): ([-A-Za-z0-9]*): (.*?) *$"
)

# File suffixes accepted as the compiled file named on a "Compiler args" line.
_SOURCE_SUFFIXES = (".c", ".h", ".cc", ".cxx", ".cpp", ".hpp")


def _resolve_path(directory, path):
    """Return path prefixed with directory unless it is absolute already.

    An empty directory leaves the path untouched so that a relative name is
    not turned into a bogus root-anchored path such as "/foo.c".
    """
    if not directory or os.path.isabs(path):
        return path
    return directory + "/" + path


def processFile(input_file, output_file):
    """Extract alerts from a Rosecheckers log and write them as TSV rows.

    Three kinds of lines are recognized (all others are ignored):

    * "In directory: <dir>" sets the directory used to resolve relative
      paths on the lines that follow.
    * "Compiler args are: ... <file>" names the file being compiled.  If the
      last argument has a C/C++ source or header suffix, its path is used for
      every alert until the next such line; otherwise no compiled file is
      considered known.
    * "<file>:<line>[:<col>]: warning|error: <checker>: <message>" is an
      alert.  One row is written per alert.

    Each output row has four tab-separated columns: checker, file path, line
    number and message (tabs inside the message are replaced by spaces).

    Args:
        input_file: iterable yielding the lines of the log (e.g. an open
            text file).
        output_file: object with a write() method that receives the rows.
    """
    directory = ""
    # Path of the file named by the latest "Compiler args" line, or None
    # when that file is unknown or is not a C/C++ source/header.
    unit_path = None

    for line in input_file:
        line = line.strip()

        parse = _DIRECTORY_RE.match(line)
        if parse is not None:
            directory = parse.group(1)
            continue

        parse = _COMPILER_ARGS_RE.match(line)
        if parse is not None:
            candidate = parse.group(1)
            if os.path.splitext(candidate)[1] in _SOURCE_SUFFIXES:
                unit_path = _resolve_path(directory, candidate)
            else:
                unit_path = None
            continue

        parse = _ALERT_RE.match(line)
        if parse is None:
            continue
        file_name, line_number, _severity, checker, message = parse.groups()
        message = message.strip().replace("\t", " ")

        if unit_path is not None:
            alert_path = unit_path
        else:
            # No compiled file is known: fall back to the name in the alert
            # itself.  The result is deliberately NOT stored in unit_path,
            # otherwise later alerts naming other files would inherit it.
            alert_path = _resolve_path(directory, file_name).strip()

        column_values = "\t".join([checker, alert_path, line_number, message])
        output_file.write(column_values + "\n")


if __name__ == "__main__":
    # Exactly two arguments are required: the raw log to read and the TSV
    # file to write.
    if len(sys.argv) != 3:
        raise TypeError("Usage: " + sys.argv[0] + " <raw-input> <tsv-output>")

    # Context managers close both files even if opening the output file or
    # parsing the log raises an exception.
    with open(sys.argv[1], "r") as input_file, \
            open(sys.argv[2], "w") as output_file:
        processFile(input_file, output_file)
