# In [None]:

import tokenize
import io
import os
import json

# generate, get_all_tokens were taken from radon library: https://github.com/rubik/radon

def generate(code):
    '''Tokenize the source text *code* and return all tokens as a list.

    The text is wrapped in an in-memory stream whose ``readline`` method
    feeds `tokenize.generate_tokens`; the resulting token iterator is fully
    consumed before returning, so any tokenizer error is raised here.
    '''
    source_stream = io.StringIO(code)
    token_stream = tokenize.generate_tokens(source_stream.readline)
    return list(token_stream)
    
def get_all_tokens(line, lines):
    '''Tokenize the shortest run of lines, beginning at *line*, that the
    tokenizer accepts.

    *line* is tried on its own first. While tokenizing raises
    :exc:`tokenize.TokenError` or yields an error token (as happens inside a
    multi-line string or statement), one more line is pulled from the
    iterator *lines* and the attempt is repeated on the joined text.

    :returns: tokens, lines -- the token list and the list of source lines
        that were consumed to produce it.
    :raises StopIteration: propagated from ``next(lines)`` when *lines* is
        exhausted before a clean tokenization is found.
    '''
    consumed = [line]
    while True:
        try:
            tokens = generate('\n'.join(consumed))
        except tokenize.TokenError:
            # Incomplete multi-line construct: needs more input.
            tokens = None

        if tokens is not None and all(
            tok[0] != tokenize.ERRORTOKEN for tok in tokens
        ):
            return tokens, consumed

        # Not tokenizable yet; extend the candidate with the next line.
        consumed.append(next(lines))

def remove_all_comments(code):
    '''Remove all comments from *code* and return them separately.

    The source is processed one statement at a time, using the line groups
    produced by `get_all_tokens`:

    * a group whose first line starts with a triple quote and that contains
      a string token is treated as a docstring and removed as a whole;
    * a line that holds nothing but a comment is removed;
    * an inline comment is cut off at the column reported by the tokenizer,
      so a ``#`` inside a string literal is left untouched and every line
      of a multi-line statement is kept.

    Inline comments are only stripped from the code; they are not added to
    the returned list of removed lines. Lines at the end of the source that
    never tokenize cleanly are kept verbatim.

    NOTE: removing a docstring that is the only statement of a body leaves
    that body empty.

    :param code: Python source text.
    :returns: ``(modified_code, removed_comments_lines)`` -- the stripped
        source joined with ``'\\n'``, and the list of removed whole-line
        comment and docstring lines.
    '''
    code_lines = code.splitlines()
    modified_code_lines = []
    removed_comments_lines = []
    lines = iter(code_lines)
    consumed = 0  # number of source lines already handled

    for line in lines:
        try:
            tokens, parsed_lines = get_all_tokens(line, lines)
        except StopIteration:
            # The rest of the source could not be tokenized (for example an
            # unterminated string). Keep it as-is rather than dropping code.
            modified_code_lines.extend(code_lines[consumed:])
            break
        consumed += len(parsed_lines)

        is_docstring = line.strip().startswith(('"""', "'''")) and any(
            tok.type == tokenize.STRING for tok in tokens
        )
        if is_docstring:
            removed_comments_lines.extend(parsed_lines)
            continue

        # The tokens were produced from '\n'.join(parsed_lines), so a token's
        # 1-based start row indexes parsed_lines and its start column is the
        # exact position of the '#' that opens the comment.
        stripped_lines = list(parsed_lines)
        for tok in tokens:
            if tok.type == tokenize.COMMENT:
                row, col = tok.start
                stripped_lines[row - 1] = parsed_lines[row - 1][:col]

        for original, stripped in zip(parsed_lines, stripped_lines):
            if stripped != original and not stripped.strip():
                # The line consisted solely of a comment.
                removed_comments_lines.append(original)
            else:
                modified_code_lines.append(stripped)

    return '\n'.join(modified_code_lines), removed_comments_lines

# In [None]:


# Paths are left empty here and must be filled in before running:
#   source_directory      - folder scanned (non-recursively) for .py files
#   destination_directory - folder receiving the comment-free copies
#   output_file_path      - folder receiving removed_comments_info.json
source_directory = ""
destination_directory = ""
output_file_path = ""

# exist_ok makes a separate existence check unnecessary.
os.makedirs(destination_directory, exist_ok=True)

# One record per processed file, dumped as JSON once all files are done.
modified_codes = []

for file in os.listdir(source_directory):
    if not file.endswith(".py"):
        continue

    print("processing file: ", file)

    # Read and close the source before writing, so the two handles never
    # overlap and neither shadows the other.
    with open(os.path.join(source_directory, file), 'r', encoding='utf-8') as src:
        code = src.read()

    modified_code, removed_comments_lines = remove_all_comments(code)

    with open(os.path.join(destination_directory, file), 'w', encoding='utf-8') as dst:
        dst.write(modified_code)

    # NOTE(review): the original never filled this list, so the report below
    # was never written. The record layout is a choice made here - confirm
    # it matches what consumers of the JSON expect.
    modified_codes.append({
        "file": file,
        "removed_comments": removed_comments_lines,
    })

if modified_codes:
    report_path = os.path.join(output_file_path, "removed_comments_info.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(modified_codes, f, indent=4)