from packaging.specifiers import SpecifierSet
from packaging.requirements import Requirement
from packaging.version import Version
import json
import pandas as pd
import os
from copy import deepcopy
from tqdm import tqdm
import pdb
import logging
import pickle as pk
from datetime import datetime

# Module-wide logger named 'Reverser'; a stream handler is attached and the
# threshold is INFO, so logger.debug(...) calls in this file are suppressed
# unless the level is lowered.
logger = logging.getLogger('Reverser')
streamhandler = logging.StreamHandler()
logger.addHandler(streamhandler)
logger.setLevel(logging.INFO)

# Accumulates (package, version, dependency_name, raw_spec) tuples for every
# requirement string that could not be parsed while reversing or querying.
fail_list = []

def gen_rev_deps(df:pd.DataFrame)->dict:
    """Turn a reversed-dependency frame into a lookup keyed by its index.

    Each row becomes a plain dict stored under ``tuple(index)``; the first two
    index levels are copied into the record as ``'name'`` and ``'version'``.
    When the row has a ``'dependents'`` column, its JSON text (a list on disk)
    is decoded into a ``set``.

    >>> key
    (package_name, version)
    >>> table[key].keys()
    'name', 'version', 'dependents'
    >>> type(table[key]['dependents'])
    set
    """
    table = {}
    for key, series in df.iterrows():
        record = series.to_dict()
        if 'dependents' in record:
            # persisted as a JSON list, handled in memory as a set
            decoded = json.loads(record['dependents'])
            record['dependents'] = set(decoded)
        record.update(name=key[0], version=key[1])
        table[tuple(key)] = record
    return table

def gen_vis(df:pd.DataFrame)->dict:
    """Turn a visited-specifier frame into a lookup keyed by package name.

    Each row becomes a plain dict stored under its index value, which is also
    copied into the record as ``'name'``. When the row has a ``'visited_spec'``
    column, its JSON text (a list on disk) is decoded into a ``set``.

    >>> key
    package_name
    >>> table[key].keys()
    'name', 'visited_spec'
    >>> type(table[key]['visited_spec'])
    set
    """
    table = {}
    for key, series in df.iterrows():
        record = series.to_dict()
        if 'visited_spec' in record:
            # persisted as a JSON list, handled in memory as a set
            decoded = json.loads(record['visited_spec'])
            record['visited_spec'] = set(decoded)
        record["name"] = key
        table[key] = record
    return table

def gen_deps(df:pd.DataFrame)->dict:
    """Turn a dependencies frame into a lookup keyed by its index.

    Every row is converted to a plain dict, the first two index levels are
    copied into it as ``'name'`` and ``'version'``, and the record is stored
    under ``tuple(index)``. Column values are kept exactly as pandas read them.

    >>> key
    (package_name, version)
    >>> table[key].keys()
    'name', 'version', 'date', 'deps', 'raw_dependencies'
    """
    table = {}
    for key, series in df.iterrows():
        record = series.to_dict()
        record.update(name=key[0], version=key[1])
        table[tuple(key)] = record
    return table

def gen_dependents(df:pd.DataFrame)->dict:
    """Rebuild the in-memory dependents cache from its CSV representation.

    The ``'dependents'`` column of each row holds a JSON list of objects with
    the keys ``dependent_name``, ``dependent_version``, ``distance`` and
    ``father``. That list is unfolded into a dict keyed by
    ``(dependent_name, dependent_version)``; every ``father`` entry is turned
    into a tuple and the entries are collected in a set.

    >>> key
    (package, version)
    >>> type(cache[key])
    dict
    >>> cache[key]['dependents'].keys()
    (package, version)
    >>> cache[key]['dependents'][dependent_key].keys()
    'distance':int, 'father':set
    """
    cache = {}
    for key, series in df.iterrows():
        record = series.to_dict()
        record["name"] = key[0]
        record["version"] = key[1]
        stored_entries = json.loads(record['dependents'])
        unfolded = {}
        for entry in stored_entries:
            dependent_key = (entry['dependent_name'], entry['dependent_version'])
            distance = entry['distance']
            # JSON has no tuples: fathers come back as lists and must be hashable
            fathers = {tuple(father) for father in entry['father']}
            unfolded[dependent_key] = {'distance': distance, 'father': fathers}
        record['dependents'] = unfolded
        cache[tuple(key)] = record
    return cache

def dependent_save(total_dependents:dict, save_path:str ):
    """Write the dependents cache to ``<save_path>/total_dependents.csv``.

    Each cache entry becomes one CSV row with the columns ``name``,
    ``version`` and ``dependents``. The ``dependents`` cell is a JSON list of
    objects with the keys ``dependent_name``, ``dependent_version``,
    ``distance`` and ``father`` (the father set is written as a list), which
    is the layout ``gen_dependents`` reads back.

    :param total_dependents: mapping ``(name, version) -> record`` where
        ``record['dependents']`` maps ``(name, version)`` to a dict holding
        ``'distance'`` and ``'father'``.
    :param save_path: directory that receives ``total_dependents.csv``.
    :return: None; the result is the file written to disk.
    """
    rows = []
    for index, content in total_dependents.items():
        # Only fresh lists/dicts are built here and the input is never
        # mutated, so no defensive deep copies are needed.
        dependents_list = [
            {
                'dependent_name': idx[0],
                'dependent_version': idx[1],
                'distance': info['distance'],
                'father': list(info['father']),  # sets are not JSON serialisable
            }
            for idx, info in content['dependents'].items()
        ]
        rows.append({
            'name': index[0],
            'version': index[1],
            'dependents': json.dumps(dependents_list),
        })
    # Naming the columns explicitly keeps sort_values/set_index working when
    # the cache is empty (a column-less frame would raise KeyError).
    dependents = pd.DataFrame(rows, columns=["name", "version", "dependents"]).\
        sort_values(["name", "version"]).\
        set_index(["name", "version"], drop=True)

    dependents.to_csv(os.path.join(save_path, "total_dependents.csv"))
    

class Reverser:
    """Build and persist a reverse-dependency index.

    Instance state:
      * ``releases_info``     -- ``{package: {'releases': [...], 'latest': str}}``
      * ``reversed_deps``     -- ``{(dependency, release): {'name', 'version',
        'dependents': set of dependent package names}}``
      * ``visited_specifier`` -- ``{package: {'name', 'visited_spec': set}}``;
        remembers which requirements of a dependent package were already
        expanded so that later versions repeating them can be skipped.
    """

    def __init__(self,metadata_path:str, intermediates_path:str, use_intermediates:bool, dump_path=None):
        """Load release information and, optionally, previously dumped state.

        :param metadata_path: directory of per-package metadata JSON files;
            only read when no ``releases_info.pkl`` cache exists in the
            current working directory.
        :param intermediates_path: directory holding ``reversed_dep.csv`` and
            ``visited_specifier.csv``.
        :param use_intermediates: when true, call ``load_intermediates``;
            otherwise start with empty tables.
        :param dump_path: directory passed to ``dump_intermediates`` by
            ``reverse_all_deps``; ``None`` falls back to ``intermediates_path``.
        """
        # metadata_path = '../inactive_task/metadata'
        if os.path.isfile('./releases_info.pkl'):
            logger.info(f"use releases_info cache")
            # NOTE(review): pickle executes arbitrary code on load; this is
            # only safe while the cache file is produced by this program.
            with open('releases_info.pkl','rb') as f:
                self.releases_info = pk.load(f)
        else:
            self.releases_info = self.load_releases(path= metadata_path)
            with open('releases_info.pkl','wb') as f:
                pk.dump(self.releases_info, f)
        self.intermediates_path = intermediates_path
        self.dump_path = dump_path
        if use_intermediates:
            self.load_intermediates()
        else:
            logger.info("initialize null intermediates")
            self.reversed_deps = dict()
            self.visited_specifier = dict()

    def load_releases(self, path:str) -> dict:
        """Collect the non-empty releases of every package found in ``path``.

        Every file in ``path`` is parsed as JSON; the package name is the file
        name minus its last five characters (presumably a ``.json`` suffix --
        confirm against the metadata directory layout).

        >>> releases_info[package_name]['releases']
        ['0.1.0','0.2.0','0.3.0']
        >>> releases_info[package_name]['latest']
        '0.3.0'

        :param path: metadata directory.
        :return: the mapping described above.
        """
        releases_info = dict()
        logger.info('Loading releases info from metadata...')
        for fname in tqdm(os.listdir(path)):
            package_name = fname[:-5]
            fpath = os.path.join(path,fname)
            with open(fpath, 'r') as f:
                metadata = json.load(f)
            releases_info[package_name] = {
                # a release with an empty entry list is left out
                'releases': [
                    release
                    for release, files in metadata['releases'].items()
                    if len(files) > 0
                ],
                'latest': metadata['info']['version'],
            }
        logger.info("Releases loaded")
        return releases_info

    def load_intermediates(self, ) -> None:
        """Load ``reversed_deps`` and ``visited_specifier`` from CSV.

        Each table is read from ``intermediates_path`` when its file exists
        and is initialised to an empty dict otherwise.
        """
        logger.info("Loading intermeidates")
        reversed_deps_fname = 'reversed_dep.csv'
        reversed_fpath = os.path.join(self.intermediates_path, reversed_deps_fname)
        visited_specifier_fname = 'visited_specifier.csv'
        visited_fpath = os.path.join(self.intermediates_path, visited_specifier_fname)

        if os.path.isfile(reversed_fpath):
            logger.info(f"{reversed_deps_fname} exists")
            t0 = datetime.now()
            self.reversed_deps = gen_rev_deps(pd.read_csv(reversed_fpath, index_col=['name', 'version']))
            t1 = datetime.now()
            logger.info(f"{len(self.reversed_deps)} releases loaded. Time cost: {t1-t0}")
        else:
            logger.info(f"{reversed_deps_fname} does not exist")
            self.reversed_deps = dict()

        if os.path.isfile(visited_fpath):
            logger.info(f"{visited_specifier_fname} exists")
            t0 = datetime.now()
            self.visited_specifier = gen_vis(pd.read_csv(visited_fpath, index_col=['name']))
            t1 = datetime.now()
            logger.info(f"{len(self.visited_specifier)} sets of package loaded. Time cost: {t1-t0}")
        else:
            logger.info(f"{visited_specifier_fname} does not exist")
            self.visited_specifier = dict()

    def dump_intermediates(self, dump_path=None)->None:
        """Write both tables to CSV, encoding their sets as JSON lists.

        :param dump_path: target directory; ``None`` means
            ``self.intermediates_path``.
        """
        if dump_path is None:
            dump_path = self.intermediates_path
            logger.info(f"Dump intermediates to {self.intermediates_path} (default path)")
        else:
            logger.info(f"Dump intermediates to {dump_path}")

        # A shallow copy is enough: only the 'dependents' slot is replaced, so
        # the in-memory set is never touched (and never deep-copied in vain).
        rev_rows = []
        for content in self.reversed_deps.values():
            row = dict(content)
            row['dependents'] = json.dumps(list(content['dependents']))
            rev_rows.append(row)
        # An empty table would yield a column-less frame on which
        # sort_values/set_index raise KeyError, so name the columns explicitly.
        if rev_rows:
            rev_frame = pd.DataFrame(rev_rows)
        else:
            rev_frame = pd.DataFrame(columns=["name", "version", "dependents"])
        dependents = rev_frame.\
            sort_values(["name", "version"]).\
            set_index(["name", "version"], drop=True)
        dependents.to_csv(os.path.join(dump_path, 'reversed_dep.csv'))

        vis_rows = []
        for content in self.visited_specifier.values():
            row = dict(content)
            row['visited_spec'] = json.dumps(list(content['visited_spec']))
            vis_rows.append(row)
        if vis_rows:
            vis_frame = pd.DataFrame(vis_rows)
        else:
            vis_frame = pd.DataFrame(columns=["name", "visited_spec"])
        visited_specifier = vis_frame.set_index(["name"], drop=True)
        visited_specifier.to_csv(os.path.join(dump_path, 'visited_specifier.csv'))

    def reverse_deps(self, dep:pd.Series)->None:
        """Register one package release as a dependent of everything it accepts.

        >>> dep
        name                        0-core-client
        version                           1.1.0a4
        date                           2017-06-21
        deps                                redis
        raw_dependencies    {"redis": ">=2.10.5"}

        For every dependency in ``raw_dependencies`` the environment marker
        (text after ``;``) is dropped, the requirement is parsed, and each
        known release of the dependency that satisfies the specifier
        (pre-releases allowed) gets ``dep['name']`` added to its
        ``dependents`` set. Requirements that fail to parse are appended to
        the module-level ``fail_list`` and skipped.

        :param dep: a row with at least ``name``, ``version`` and
            ``raw_dependencies`` (JSON object ``{dependency: spec}``).
        """
        package_name = dep['name']
        dep_version = dep['version']
        dependencies = json.loads(dep['raw_dependencies'])
        logger.debug(f"reversing {package_name}, {dep_version}'s deps:{dependencies}")

        if package_name not in self.visited_specifier:
            self.visited_specifier[package_name] = {
                'name': package_name,
                'visited_spec': set(),
            }
        visited = self.visited_specifier[package_name]['visited_spec']

        for depend_name, specs in dependencies.items():
            # remove markers
            clean_specs = specs.split(';', 1)[0]
            try:
                req = Requirement(depend_name+' '+clean_specs)
            except Exception:
                fail_list.append((package_name, dep_version, depend_name, specs))
                continue
            spec = req.specifier
            spec.prereleases = True
            logger.debug(f"ori_specs: {specs}, spec:{spec}")

            # The visited set stores strings, so the lookup must use a string
            # too (a SpecifierSet object never matched). The key also carries
            # the dependency name: two different dependencies can share the
            # same specifier text (e.g. both unconstrained) and must not
            # shadow each other. Only package names are recorded as
            # dependents, so repeating the same requirement for another
            # version of this package cannot add anything new.
            visit_key = f"{depend_name}{spec}"
            if visit_key in visited:
                continue
            visited.add(visit_key)

            # the dependency is removed from pypi by project owner
            if depend_name not in self.releases_info:
                continue
            for release in self.releases_info[depend_name]['releases']:
                try:
                    is_contained = (release in spec)
                except Exception:
                    # release string that packaging cannot interpret
                    is_contained = False
                if not is_contained:
                    continue
                name_ver_index = (depend_name, release)
                if name_ver_index not in self.reversed_deps:
                    self.reversed_deps[name_ver_index] = {
                        'name': depend_name,
                        'version': release,
                        'dependents': set(),
                    }
                self.reversed_deps[name_ver_index]['dependents'].add(package_name)

    def reverse_all_deps(self, dep_fpath:str, save:bool)->None:
        """Reverse every row of a dependencies CSV.

        :param dep_fpath: path of the CSV; each row is passed to
            ``reverse_deps``.
        :param save: when true, dump the tables to ``self.dump_path``
            afterwards.
        """
        logger.info(f"reversing {dep_fpath}...")
        dep_df = pd.read_csv(dep_fpath)

        # iterrows() has no length, so hand tqdm the total for a real bar
        for _, row in tqdm(dep_df.iterrows(), total=len(dep_df)):
            self.reverse_deps(row)
        if save:
            self.dump_intermediates(self.dump_path)



def query(package_name:str, version:str, reversed_deps:dict, deps:dict, releases_info:dict)->dict:
    """Breadth-first search for every release that depends on a given release.

    >>> index
    tuple([package_name, version])
    >>> dic[index]['distance']
    n # number of hops from the queried release at first discovery
    >>> dic[index]['father']
    {a, b, c} # set of (name, version) releases it was reached from

    :param package_name: name of the queried package.
    :param version: version of the queried package.
    :param reversed_deps: ``{(name, version): {'dependents': iterable of
        package names}}``.
    :param deps: ``{(name, version): {'raw_dependencies': JSON text}}``.
    :param releases_info: ``{package: {'releases': [...]}}``.
    :return: mapping described above; it always contains the queried release
        itself with distance 0.
    """
    logger.debug(f"traversing {package_name}:{version} dependents")

    def depends_on(down_index:tuple, fa_index:tuple)->bool:
        """Return whether release ``down_index`` accepts release ``fa_index``."""
        if down_index not in deps:
            return False
        down_dependencies = json.loads(deps[down_index]['raw_dependencies'])
        if fa_index[0] not in down_dependencies:
            return False
        specs = down_dependencies[fa_index[0]]
        # remove markers
        clean_specs = specs.split(';', 1)[0]
        try:
            req = Requirement(fa_index[0]+' '+clean_specs)
        except Exception:
            fail_list.append((down_index[0], down_index[1], fa_index[0], specs))
            return False
        spec = req.specifier
        spec.prereleases = True
        logger.debug(f"ori_specs: {specs}, spec:{spec}")
        try:
            return fa_index[1] in spec
        except Exception:
            # version string that packaging cannot interpret
            return False

    index = (package_name, version)
    dependents = {index: {"distance":0, "father":set()}}
    # The list doubles as the FIFO queue: q_index is the read cursor, so
    # nothing is ever popped from the front.
    queue = [index]
    q_index = 0
    while len(queue)>q_index:
        print(f"{100*q_index/len(queue)}% {q_index} / {len(queue)}")
        cur_index = queue[q_index]
        logger.debug(f"iteratering queue({len(queue)-q_index}), current_index: {cur_index}")
        cur_distance = dependents[cur_index]['distance'] + 1
        q_index += 1

        if cur_index not in reversed_deps:
            continue
        for package in reversed_deps[cur_index]['dependents']:
            # A dependent without metadata has no known releases; skip it
            # instead of failing the whole query with KeyError.
            releases = releases_info.get(package, {}).get('releases', [])
            for release in releases:
                down_index = (package, release)
                if not depends_on(down_index, cur_index):
                    continue
                if down_index not in dependents:
                    # first discovery fixes the distance (BFS order)
                    dependents[down_index] = {"distance":cur_distance, "father":{cur_index}}
                    queue.append(down_index)
                else:
                    dependents[down_index]['father'].add(cur_index)
    return dependents

class Qy:
    """Cached dependents query.

    Bundles the data needed by a query (merged dependency table, reversed
    dependencies, release lists) and memoises results in
    ``self.total_dependents``, which is also persisted through
    ``dependent_save`` after every uncached query.

    ``total_dependents`` layout::

        {(name, version): {'name', 'version',
                           'dependents': {(name, version): {'distance': int,
                                                            'father': set}}}}
    """
    def __init__(self, 
                 deps_fpaths:list, 
                 metadata_path:str, 
                 intermediates_path:str,
                 cache_path:str,
                 reversed_deps:dict,
                 releases_info:dict
                 ) -> None:
        """Store the inputs, merge the dependency tables and load the cache.

        :param deps_fpaths: paths of dependency CSV files to merge.
        :param metadata_path: stored on the instance; not read in this class.
        :param intermediates_path: stored on the instance; not read in this
            class.
        :param cache_path: directory holding ``total_dependents.csv``.
        :param reversed_deps: ``{(name, version): {'dependents': ...}}``.
        :param releases_info: ``{package: {'releases': [...]}}``.
        """
        logger.info("Initializing Qy (query) class")
        self.deps_fpaths = deps_fpaths
        self.metadata_path = metadata_path
        self.intermediates_path = intermediates_path
        self.cache_path = cache_path
        self.reversed_deps = reversed_deps
        self.releases_info = releases_info
        self.merge_deps = self._merge_deps_together()
        cache_fpath = os.path.join(cache_path, 'total_dependents.csv')
        if os.path.isfile(cache_fpath):
            logger.info("Loading total dependents cache...")
            self.total_dependents = gen_dependents(pd.read_csv(cache_fpath, index_col=["name","version"]))
        else:
            logger.info("No total depdendents cached, create a new file")
            self.total_dependents = dict()

    def _load_deps_to_dict(self,)->None:
        """Read every file in ``deps_fpaths`` into ``self.deps`` (one dict each)."""
        self.deps = []
        logger.info(f"Loading {len(self.deps_fpaths)} deps files")
        for path in self.deps_fpaths:
            t0 = datetime.now()
            self.deps.append(gen_deps( pd.read_csv(path, index_col=["name","version"])))
            t1 = datetime.now()
            logger.info(f"{len(self.deps[-1])} deps loaded. Time cost:{t1-t0}")

    def _merge_deps_together(self)->dict:
        """Return the union of all dependency tables, earlier files winning.

        The result is cached in ``merge_deps.pkl`` in the current working
        directory; when that file exists it is returned without reading the
        CSV files.
        """
        if os.path.isfile('merge_deps.pkl'):
            logger.info("Loading cached merge_deps.pkl...")
            # NOTE(review): pickle executes arbitrary code on load; only safe
            # while the cache file is produced by this program.
            with open('merge_deps.pkl','rb') as f:
                total_deps = pk.load(f)
            return total_deps
        logger.info("Merging deps...")
        self._load_deps_to_dict()
        # Starting from an empty dict gives the same "first file wins" result
        # and also works when no dependency file was supplied.
        total_deps = dict()
        for deps in self.deps:
            for idx, item in deps.items():
                if idx not in total_deps:
                    total_deps[idx] = deepcopy(item)
        # context manager so the cache file is flushed and closed
        with open('merge_deps.pkl','wb') as f:
            pk.dump(total_deps, f)
        return total_deps

    @staticmethod
    def _is_dep(down_index:tuple, fa_index:tuple, deps:dict)->bool:
        """Return whether release ``down_index`` accepts release ``fa_index``.

        The environment marker (text after ``;``) is dropped before parsing
        and pre-releases are allowed. An unparsable requirement is appended
        to the module-level ``fail_list`` and counts as "no".
        """
        if down_index not in deps:
            return False
        down_dependencies = json.loads(deps[down_index]['raw_dependencies'])
        if fa_index[0] not in down_dependencies:
            return False
        specs = down_dependencies[fa_index[0]]
        # remove markers
        clean_specs = specs.split(';', 1)[0]
        try:
            req = Requirement(fa_index[0]+' '+clean_specs)
        except Exception:
            fail_list.append((down_index[0], down_index[1], fa_index[0], specs))
            return False
        spec = req.specifier
        spec.prereleases = True
        logger.debug(f"ori_specs: {specs}, spec:{spec}")
        try:
            return fa_index[1] in spec
        except Exception:
            # version string that packaging cannot interpret
            return False

    def _dfs(self,name:str,version:str)->None:
        """Fill ``total_dependents[(name, version)]`` by recursive traversal.

        Returns immediately when the release already has an entry. While a
        release is being traversed its own self-entry carries distance -1;
        any dependent that is reached through such an in-progress release is
        recorded with distance -1 as well and is repaired later by
        ``_clean_dirty_dependents``.

        NOTE(review): the traversal recurses once per hop, so a very long
        dependency chain could presumably hit the interpreter's recursion
        limit.
        """
        cur_index = tuple([name,version])
        logger.info(f"traversing {cur_index}")
        if cur_index in self.total_dependents:
            return
        # placeholder marking this release as "in progress"
        self.total_dependents[cur_index] = dict()
        self.total_dependents[cur_index]["name"] = name
        self.total_dependents[cur_index]["version"] = version
        self.total_dependents[cur_index]["dependents"] = dict() 
        self.total_dependents[cur_index]["dependents"][cur_index] = {"distance":-1, "father":set()}
        # working copy that is merged into the placeholder at the end
        dependents = dict()
        dependents["name"] = name
        dependents["version"] = version
        dependents["dependents"] = dict()
        dependents["dependents"][cur_index] = {"distance":0, "father":set()}
        if cur_index not in self.reversed_deps:
            self.total_dependents[cur_index] = deepcopy(dependents)
            return
        
        downstream_packages = self.reversed_deps[cur_index]['dependents']
        for package in downstream_packages:
            # A dependent without metadata has no known releases; skip it
            # instead of failing the whole query with KeyError.
            releases = self.releases_info.get(package, {}).get('releases', [])
            for release in releases:
                down_index = tuple([package, release])

                if self._is_dep(down_index=down_index,
                                fa_index=cur_index,
                                deps=self.merge_deps):
                    self._dfs(down_index[0], down_index[1])

                    for idx, item in self.total_dependents[down_index]['dependents'].items():

                        down_distance = item['distance']
                        down_father = deepcopy(item['father'])
                        if down_distance == 0:
                            # idx is down_index itself: a direct dependent
                            down_father.add(cur_index)
                        if idx not in dependents["dependents"].keys():
                            if down_distance == -1:
                                dependents["dependents"][idx] = {"distance":-1,
                                                                 "father":set()}
                                if down_index == idx:
                                    dependents["dependents"][idx]["father"].add(cur_index)
                                    self.total_dependents[idx]["dependents"][idx]["father"].add(cur_index)
                            else:
                                dependents["dependents"][idx] = {"distance":down_distance+1,
                                                                 "father":deepcopy(down_father)}
                        else:
                            if down_distance == -1 :
                                if idx == cur_index:
                                    continue
                                else:
                                    dependents["dependents"][idx]['distance'] = -1
                                if down_index == idx:
                                    dependents["dependents"][idx]["father"].add(cur_index)
                                    self.total_dependents[idx]["dependents"][idx]["father"].add(cur_index)
                            else:
                                dependents["dependents"][idx]['distance'] = min(dependents["dependents"][idx]['distance'],
                                                                                down_distance+1)
                            if down_distance == 0 :
                                dependents["dependents"][idx]["father"].add(cur_index)
                                self.total_dependents[idx]["dependents"][idx]["father"].add(cur_index)

        for idx, item in dependents["dependents"].items():
            if cur_index in dependents["dependents"][idx]["father"]:
                self.total_dependents[idx]["dependents"][idx]["father"].add(cur_index)
            if idx == cur_index:
                # traversal finished: the self-entry leaves the -1 state
                self.total_dependents[cur_index]["dependents"][idx]["distance"] = 0
                continue
            self.total_dependents[cur_index]["dependents"][idx] = deepcopy(item)
        return
        
    def _clean_dirty_dependents(self, name, version)->None:
        """Repair the entries that ``_dfs`` left with distance -1.

        The dependency network can contain loops. During ``_dfs`` a release
        that is still being traversed reports distance -1, so releases
        upstream of it inherit entries whose distance is unknown. Once the
        traversal is complete those entries are resolved here:

        1. every release reachable from ``(name, version)`` is classified as
           clean (no -1 entry) or dirty (its -1 entries go to ``neg_list``);
        2. a dirty release whose ``neg_list`` members are all clean is
           repaired by deriving each missing distance from the recorded
           fathers and folding in the clean release's own dependents;
        3. the repaired records are written back to ``total_dependents``.

        :raises AssertionError: when a -1 entry cannot be resolved or dirty
            releases remain after the loop.
        """
        cur_index = tuple([name,version])
        alpha_dependents = deepcopy(self.total_dependents[cur_index]["dependents"])
        clean_dependents = dict()
        dirty_dependents = dict()
        # find out index with neg-dist dependents in this query
        for index in alpha_dependents.keys():
            item = self.total_dependents[index]["dependents"]
            # item.keys() == ("a","0.1.0") ...
            flag = True
            for idx, details in item.items():
                # details.keys() == "distance", "father"
                if details["distance"] == -1:
                    flag = False
                    if index not in dirty_dependents:
                        dirty_dependents[index] = deepcopy(self.total_dependents[index])
                        dirty_dependents[index]["neg_list"] = []
                    dirty_dependents[index]["neg_list"].append(idx)
            if flag:
                clean_dependents[index] = deepcopy(self.total_dependents[index])

        logger.info(f"cleaning {len(dirty_dependents)} dirty depdendents...")
        while len(dirty_dependents):
            # find out update index in this round: the last dirty release
            # whose unresolved entries all point at clean releases
            del_index = None
            for index, item in dirty_dependents.items():
                flag = True
                for idx in item["neg_list"]:
                    if idx not in clean_dependents:
                        flag = False
                        break
                if flag:
                    del_index = index
            if del_index is None:
                break
            
            cur_item = deepcopy(dirty_dependents[del_index])
            neg_set = set(deepcopy(dirty_dependents[del_index]["neg_list"]))
            cur_item.pop("neg_list")
            while(len(neg_set)>0):

                # pick an unresolved entry that has at least one father whose
                # distance is already known inside cur_item
                neg_list = list(neg_set)
                del_neg_idx = None
                for idx in neg_list:
                    flag = False
                    fathers = clean_dependents[idx]["dependents"][idx]["father"]
                    for fa in fathers:
                        if fa in cur_item["dependents"]:
                            
                            cur_dis = cur_item["dependents"][fa]["distance"]
                            if cur_dis != -1:
                                flag = True
                                break
                    if flag:
                        del_neg_idx = idx
                        break
                assert del_neg_idx is not None, "can not find del_neg_idx"
                
                idx = del_neg_idx
                neg_set.remove(idx)
                # update distance of neg-distance node
                fathers = clean_dependents[idx]["dependents"][idx]["father"]
                # fathers is a set: {("a","0.2"), ...}
                shortest_dis = None
                for fa in fathers:
                    if fa in cur_item["dependents"]:
                        cur_item["dependents"][idx]["father"].add(fa)
                        cur_dis = cur_item["dependents"][fa]["distance"]+1
                        
                        # cur_dis == 0 means the father itself is still -1
                        if cur_dis>0 and (shortest_dis is None or shortest_dis > cur_dis):
                            shortest_dis = cur_dis
                assert shortest_dis is not None, "neg-distance node has no distance"
                cur_item["dependents"][idx]["distance"] = shortest_dis
                # append neg-node's dependents to del_index
                for neg_depents, details in clean_dependents[idx]["dependents"].items():
                    depent_dis = details["distance"] + shortest_dis
                    if neg_depents not in cur_item["dependents"]:
                        cur_item["dependents"][neg_depents] = {
                            "distance":depent_dis,
                            "father":set()
                        }
                    else:
                        cur_item["dependents"][neg_depents]["distance"] =\
                            min(cur_item["dependents"][neg_depents]["distance"], depent_dis)
                    
                for neg_depents, details in clean_dependents[idx]["dependents"].items():
                    depent_father = details["father"]
                    for fa in depent_father:
                        if fa in cur_item["dependents"]:
                            cur_item["dependents"][neg_depents]["father"].add(fa)

                clean_dependents[del_index] = deepcopy(cur_item)

            dirty_dependents.pop(del_index)
        # NOTE(review): looks like a debugging artefact -- whatever could not
        # be repaired is dumped to the working directory before the assert.
        with open("dirty_dict.pkl","wb") as f:
            pk.dump(dirty_dependents,f)
        assert len(dirty_dependents) == 0, "dirty dependents haven't been fixed"
        for idx, item in clean_dependents.items():
            self.total_dependents[idx] = deepcopy(item)

    def query(self, name:str, version:str)->dict:
        """Return the cached record of ``(name, version)``, computing it first
        if necessary.

        An uncached release is traversed with ``_dfs``, repaired with
        ``_clean_dirty_dependents`` and the whole cache is then written to
        ``cache_path`` through ``dependent_save``.

        :return: ``{'name', 'version', 'dependents': {(name, version):
            {'distance', 'father'}}}``.
        """
        cur_index = tuple([name,version])
        logger.info(f"query {cur_index}")
        if cur_index not in self.total_dependents:
            self._dfs(name,version)
            # NOTE(review): hard-coded snapshot file name; looks like a
            # debugging artefact written before the loop clean-up.
            with open("httpx0220.pkl","wb") as f:
                pk.dump(self.total_dependents, f)
            # update loops in dependencies
            self._clean_dirty_dependents(name,version)
            logger.info("cache dependents.csv updated")
            dependent_save(self.total_dependents, self.cache_path)
        return self.total_dependents[cur_index]

def query2(package_name:str, version:str, cahce_path:str, reversed_deps:dict, deps:dict, releases_info:dict):
    """:return a dict of dependents of the package_name-version pair

    If ``<cahce_path>/dependents.csv`` exists and already holds the pair,
    the cached entry is returned as loaded by ``gen_dependents``.
    Otherwise the dependents are collected with a breadth-first search
    over ``reversed_deps`` and returned in the following shape:
    >>> index
    tuple([package_name, version])
    >>> dic[index]['distance']
    n # "n" denotes the minimum distance from vulnerable package
    >>> dic[index]['father']
    set of indexes that the release directly depends on
    >>> dic['name'], dic['version']
    package_name, version  # the queried pair itself

    :param package_name: name of the queried package
    :param version: version of the queried package
    :param cahce_path: directory that may contain ``dependents.csv``
    :param reversed_deps: index -> item whose 'dependents' holds the names
        of the packages that depend on that index
    :param deps: index -> item whose 'raw_dependencies' is a json object
        mapping a dependency name to its specifier string
    :param releases_info: package name -> item whose 'releases' lists the
        versions of that package
    """

    def is_dep(down_index:tuple, fa_index:tuple, deps:dict)->bool:
        """ return whether down_index depends on fa_index
        """
        if down_index not in deps:
            return False
        down_dependencies = json.loads(deps[down_index]['raw_dependencies'])
        if fa_index[0] not in down_dependencies:
            return False
        specs = down_dependencies[fa_index[0]]
        clean_specs = specs
        if ';' in specs:
            # remove markers
            clean_specs = specs.split(';')[0]
        try:
            req = Requirement(fa_index[0]+' '+clean_specs)
        except Exception:
            # unparsable requirement: record it and treat as "no dependency"
            fail_list.append((down_index[0], down_index[1], fa_index[0], specs))
            return False
        spec = req.specifier
        # pre-releases of the upstream package count as matches too
        spec.prereleases = True
        logger.debug(f"ori_specs: {specs}, spec:{spec}")

        try:
            return fa_index[1] in spec
        except Exception:
            # the upstream version string cannot be compared to the specifier
            return False

    cache_fpath = os.path.join(cahce_path, 'dependents.csv')
    if os.path.isfile(cache_fpath):
        total_dependents = gen_dependents(pd.read_csv(cache_fpath, index_col=["name","version"]))
    else:
        total_dependents = dict()

    index = (package_name, version)
    logger.info(f"query {index}")
    logger.debug(f"traversing {package_name}:{version} dependents")

    # Cache hit: reuse the stored entry instead of traversing again.
    if index in total_dependents:
        return total_dependents[index]

    # The root must be stored under its own index, because the loop below
    # reads dependents[cur_index]['distance'] for every dequeued index.
    dependents = {
        "name": package_name,
        "version": version,
        index: {"distance": 0, "father": set()},
    }
    queue = [index]
    q_index = 0
    # Do bfs; the list is never popped, q_index marks the next entry to expand.
    while q_index < len(queue):
        print(f"{100*q_index/len(queue)}% {q_index} / {len(queue)}")
        cur_index = queue[q_index]
        logger.debug(f"iteratering queue({len(queue)-q_index}), current_index: {cur_index}")
        cur_distance = dependents[cur_index]['distance'] + 1
        q_index += 1

        if cur_index not in reversed_deps:
            continue
        for package in reversed_deps[cur_index]['dependents']:
            for release in releases_info[package]['releases']:
                down_index = (package, release)
                if not is_dep(down_index=down_index,
                              fa_index=cur_index,
                              deps=deps):
                    continue
                if down_index not in dependents:
                    # First visit.  Entries are expanded in non-decreasing
                    # distance order, so this distance is already minimal.
                    dependents[down_index] = {"distance":cur_distance, "father":{cur_index}}
                    queue.append(down_index)
                else:
                    dependents[down_index]['father'].add(cur_index)
    return dependents

def test(rev:Reverser, dep_fpath:str)->None:
    """Interactive debugging driver: reverse the dependencies stored in
    dep_fpath, then query every reversed index.

    Stops twice in pdb: once after the reversal, and once after all
    queries so that ``out`` and ``non_trivial`` can be inspected.

    :param rev: Reverser whose reversed_deps / releases_info are used
    :param dep_fpath: path of the dependencies csv, indexed by the
        "name" and "version" columns
    """
    # The index columns are "name" and "version", as in the other reads
    # of this csv schema in the file.
    dep_df = gen_deps(pd.read_csv(dep_fpath, index_col=["name","version"]))
    rev.reverse_all_deps(dep_fpath=dep_fpath,
                         save=True)
    keys = list(rev.reversed_deps.keys())
    pdb.set_trace()
    print(keys[10:20])
    # NOTE(review): this calls a module-level `query` taking
    # (name, version, reversed_deps, deps, releases_info); confirm it still
    # exists, the visible module-level variant is `query2`, which also
    # needs a cache path.
    out = [query(index[0], index[1], rev.reversed_deps, dep_df, rev.releases_info)
           for index in keys]
    # Bound to a name (instead of being discarded) so that it is available
    # at the breakpoint below.
    non_trivial = [dependents for dependents in out if len(dependents)>1]
    pdb.set_trace()

if __name__ == '__main__':
    # Input / output locations, relative to the working directory.
    metadata_path = '../inactive_task/metadata'
    intermediates_path = './reverse_deps'
    dependencies_path = '../ghd_dataset/dataset/pypi.cache/dependencies.csv'
    dependencies_path2 = '../ghd_dataset/dataset/pypi.cache/dependencies_sv1.csv'

    rev = Reverser(
        metadata_path=metadata_path,
        intermediates_path=intermediates_path,
        use_intermediates=True,
    )

    # Reverse both dependency tables, in this order, saving after each.
    for dep_csv in (dependencies_path, dependencies_path2):
        rev.reverse_all_deps(dep_fpath=dep_csv, save=True)

    # Scratch calls kept for manual debugging:
    # test(rev=rev,
    #      dep_fpath=dependencies_path)
    # pdb.set_trace()
    # rev.reverse_all_deps(dep_fpath=dependencies_path,
    #                      save=False)

    # deps = gen_deps(pd.read_csv(dependencies_path, index_col=["name", "version"]))

    # query(  package_name='cryptoauthlib',
    #         version = "",
    #         reverse_deps=rev.reversed_deps,
    #         deps= deps,
    #         releases_info=rev.releases_info
    #       )
    