In [None]:
import re
from math import log
import json
from itertools import combinations
from collections import defaultdict
from typing import Any, Optional, Dict, Union, Callable, Tuple, Type, List
import numpy as np
import math
import random
from enum import Enum
from operator import itemgetter
from pathlib import Path

In [None]:
def overlap(s1: str, s2: str, **kwargs) -> float:
    """Word-overlap similarity between two whitespace-tokenized sentences.

    The score is the number of distinct tokens shared by both sentences divided
    by ``log(|s1|) + log(|s2|)``, where ``|s|`` is the number of distinct tokens.

    Args:
        s1: First sentence; tokens are obtained with ``str.split()``.
        s2: Second sentence; tokenized the same way.
        **kwargs: Accepted and ignored so this function can be called with the
            same keyword arguments as other similarity callables.

    Returns:
        The normalized overlap, or ``0.0`` when the normalizer is undefined:
        either sentence has no tokens (``log(0)``), or both sentences have
        exactly one distinct token (``log(1) + log(1) == 0``).
    """
    words1 = set(s1.split())
    words2 = set(s2.split())
    # An empty sentence shares nothing with anything; log(0) would raise ValueError.
    if not words1 or not words2:
        return 0.0
    norm = log(len(words1)) + log(len(words2))
    # Two single-token sentences give a zero normalizer; avoid ZeroDivisionError.
    if norm == 0.0:
        return 0.0
    return len(words1 & words2) / norm

In [None]:
# Matches any single character that is not a lowercase ASCII letter or a digit.
ASCII = re.compile(r"[^a-z0-9]")
# Matches strings starting with "N" or "J"; not referenced by the code visible in this notebook.
POS = re.compile(r"^[NJ]")

def norm_token(token: str) -> str:
    """Lowercase a token and strip every character outside ``a-z`` / ``0-9``.

    Args:
        token: The raw token.

    Returns:
        The normalized token; this is the empty string when no character survives.
    """
    lowered = token.lower()
    return re.sub(ASCII, "", lowered)

In [None]:
def norm_sentence(sent: str) -> str:
    """Normalize a sentence token by token.

    The sentence is split on whitespace, each token is passed through
    ``norm_token``, and the results are joined with single spaces. Tokens that
    normalize to the empty string are still joined, so they leave extra spaces.

    Args:
        sent: The raw sentence.

    Returns:
        The space-joined normalized tokens.
    """
    return " ".join(norm_token(word) for word in sent.split())

In [None]:
def build_vocab(tokens: List[str]) -> Dict[str, int]:
    """Assign each distinct token an integer id in order of first appearance.

    Args:
        tokens: The items to index; repeated items keep the id of their first occurrence.

    Returns:
        A plain dict mapping each distinct token to an id in ``0..n-1``.
    """
    index: Dict[str, int] = {}
    for item in tokens:
        # len(index) is evaluated before insertion, so a new item gets the next free id.
        index.setdefault(item, len(index))
    return index

In [None]:
class Vertex:
    """A graph node: a label plus weighted maps of its outgoing and incoming edges."""

    def __init__(self, value: str):
        self.value = value
        # Both maps are keyed by the neighbour's integer index (from the vocab mapping)
        # and hold the edge weight.
        self._edges_in: Dict[int, float] = {}
        self._edges_out: Dict[int, float] = {}

    @property
    def edges_out(self) -> Dict[int, float]:
        """Mutable map of target index -> weight for edges leaving this vertex."""
        return self._edges_out

    @property
    def edges_in(self) -> Dict[int, float]:
        """Mutable map of source index -> weight for edges arriving at this vertex."""
        return self._edges_in

    @property
    def degree_in(self) -> int:
        """Number of incoming edges."""
        return len(self._edges_in)

    @property
    def degree_out(self) -> int:
        """Number of outgoing edges."""
        return len(self._edges_out)

    def __str__(self) -> str:
        return f"V(term={self.value}, in={self.degree_in}, out={self.degree_out})"

    def __eq__(self, other) -> bool:
        """Compare label and both edge maps; raises TypeError for non-Vertex operands."""
        if not isinstance(other, Vertex):
            raise TypeError(f"Can only compare to other Vertex objects, got {type(other)}")
        if self is other:
            return True
        # Equal only when none of the three components differs.
        differs = (
            self.value != other.value
            or self._edges_out != other._edges_out
            or self._edges_in != other._edges_in
        )
        return not differs

In [None]:
class Graph:
    """Base class that maps vertex labels to contiguous integer indices and back.

    This class does not define ``vertex_count`` or ``edge_count``; ``density``
    and ``__str__`` read them from ``self``, so subclasses must provide them.
    ``to_dot`` must also be implemented by subclasses.
    """

    def __init__(self, vertices: Union[Dict[str, int], List[str]]):
        """Build the label <-> index lookups.

        Args:
            vertices: Either a mapping of label -> index whose indices are exactly
                ``0..n-1``, or a list of labels that are indexed by position.

        Raises:
            ValueError: If a mapping is given whose indices are not ``0..n-1``.
        """
        if isinstance(vertices, dict):
            if set(vertices.values()) != set(range(len(vertices))):
                raise ValueError("Vertex indices must be contiguous")
            # Copy so that adding vertices later never mutates the caller's mapping.
            self.label2idx: Dict[str, int] = dict(vertices)
        else:
            self.label2idx: Dict[str, int] = {n: i for i, n in enumerate(vertices)}
        self.idx2label: Dict[int, str] = {i: k for k, i in self.label2idx.items()}

    def __getitem__(self, key: Union[str, int]) -> Union[int, str]:
        """Translate an index to its label, or a label to its index."""
        if isinstance(key, int):
            return self.idx2label[key]
        return self.label2idx[key]

    def __contains__(self, key: Union[str, int]) -> bool:
        """Report whether an index (int key) or label (any other key) is known."""
        if isinstance(key, int):
            return key in self.idx2label
        return key in self.label2idx

    def _add_vertex(self, label: Optional[str]) -> int:
        """Register a new label and return the index assigned to it.

        Args:
            label: The label to add; ``None`` generates one from the current
                vertex count.

        Raises:
            ValueError: If the label is already registered.
        """
        if label is None:
            label = str(len(self.label2idx))
        if label in self.label2idx:
            raise ValueError(f"Node labels must be unique, label {label} is already in use.")
        idx = len(self.label2idx)
        self.label2idx[label] = idx
        self.idx2label[idx] = label
        return idx

    @property
    def density(self) -> float:
        """Edge count divided by ``V * (V - 1)``; ``0.0`` for fewer than two vertices."""
        n = self.vertex_count
        # With 0 or 1 vertices the denominator is zero and no edge can exist.
        if n < 2:
            return 0.0
        return self.edge_count / (n * (n - 1))

    def __str__(self) -> str:
        return f"G(V={self.vertex_count}, E={self.edge_count}, D={self.density})"

    def to_dot(self, directed: bool = False, label_length: Optional[int] = None) -> str:
        """Render the graph as DOT text; not implemented on the base class."""
        raise NotImplementedError

In [None]:
class AdjacencyList(Graph):
    """Graph stored as a list of ``Vertex`` objects, each holding its own edge maps.

    The vertex at position ``i`` of ``vertices`` is the vertex whose index is ``i``.
    """

    def __init__(self, vertices: Dict[str, int]):
        """Create one edge-less ``Vertex`` per label.

        Args:
            vertices: Forwarded unchanged to ``Graph.__init__``.
        """
        super().__init__(vertices)
        # Build the list in index order. Iterating the label mapping directly follows
        # dict insertion order, which misplaces vertices when the supplied mapping is
        # not already sorted by index (e.g. {"a": 1, "b": 0}).
        self._vertices: List[Vertex] = [Vertex(self.idx2label[i]) for i in range(len(self.idx2label))]

    @property
    def vertices(self) -> List[Vertex]:
        """The vertex list, positioned by vertex index."""
        return self._vertices

    def add_vertex(self, label: Optional[str]) -> int:
        """Register a new vertex and return its index.

        Args:
            label: The label for the vertex; ``None`` lets the base class generate one.

        Raises:
            ValueError: If the label is already in use, or the assigned index does
                not equal the current length of the vertex list.
        """
        idx = self._add_vertex(label)
        if idx != len(self.vertices):
            raise ValueError(
                f"The added vertex has a label that is out of order, expected: {len(self.vertices)} found: {idx}"
            )
        # Read the label back from the index map so an auto-generated label
        # (label=None) is stored on the vertex instead of None.
        self.vertices.append(Vertex(self.idx2label[idx]))
        return idx

    def add_edge(self, source: Union[str, int], target: Union[str, int], weight: float = 1.0) -> None:
        """Add (or overwrite) the directed edge ``source -> target``.

        Args:
            source: Label or index of the source vertex.
            target: Label or index of the target vertex.
            weight: Edge weight; must not be negative.

        Raises:
            ValueError: If ``weight`` is negative or source and target are the same vertex.
        """
        if weight < 0.0:
            raise ValueError(f"Edge weight must not be negative, got {weight}")
        source_idx = source if isinstance(source, int) else self[source]
        target_idx = target if isinstance(target, int) else self[target]
        if source_idx == target_idx:
            raise ValueError(f"Self loops are not allowed, found edge with source and target of {source_idx}")
        # Record the edge on both endpoints so each vertex knows its in- and out-edges.
        self.vertices[source_idx].edges_out[target_idx] = weight
        self.vertices[target_idx].edges_in[source_idx] = weight

    @property
    def vertex_count(self) -> int:
        """Number of vertices."""
        return len(self.vertices)

    @property
    def edge_count(self) -> int:
        """Number of directed edges (sum of out-degrees)."""
        return sum(v.degree_out for v in self.vertices)

    def print_graph(self, label_length: Optional[int] = None) -> None:
        """Print a summary line, then every vertex with its outbound and inbound edges.

        Args:
            label_length: If given, labels are truncated to this many characters.
        """
        print(str(self))
        for v in self.vertices:
            v_idx = self[v.value]
            print(f"\tVertex {v_idx}: {v.value[:label_length]}")
            print(f"\t\tOutbound:")
            for idx, weight in v.edges_out.items():
                print(f"\t\t\t{v_idx} -> {idx}: {weight}")
            print(f"\t\tInbound:")
            for idx, weight in v.edges_in.items():
                print(f"\t\t\t{v_idx} <- {idx}: {weight}")

    def to_dot(self, directed: bool = False, label_length: Optional[int] = None) -> str:
        """Render the graph as DOT text.

        Args:
            directed: Emit a ``digraph`` with every directed edge when true; otherwise
                emit a ``graph`` with each vertex pair at most once.
            label_length: If given, labels are truncated to this many characters.
        """
        if directed:
            return self._to_directed_dot(label_length)
        return self._to_undirected_dot(label_length)

    def _to_directed_dot(self, label_length: Optional[int] = None) -> str:
        """Render every vertex and every outgoing edge as a DOT ``digraph``."""
        dot = ["digraph G {"]
        for v in self.vertices:
            v_idx = self[v.value]
            dot.append(f'\t{v_idx} [label="{v.value[:label_length]}"];')
            for idx, weight in v.edges_out.items():
                dot.append(f'\t{v_idx} -> {idx} [label="{weight}"];')
        dot.append("}")
        return "\n".join(dot)

    def _to_undirected_dot(self, label_length: Optional[int] = None) -> str:
        """Render a DOT ``graph``, emitting each unordered vertex pair once.

        The weight shown for a pair is the one on the first direction encountered.
        """
        dot = ["graph G {"]
        seen = set()
        for v in self.vertices:
            v_idx = self[v.value]
            dot.append(f'\t{v_idx} [label="{v.value[:label_length]}"];')
            for idx, weight in v.edges_out.items():
                # Order the endpoints so a -> b and b -> a map to the same key.
                pair = (min(v_idx, idx), max(v_idx, idx))
                if pair in seen:
                    continue
                dot.append(f'\t{v_idx} -- {idx} [label="{weight}"];')
                seen.add(pair)
        dot.append("}")
        return "\n".join(dot)

In [None]:
class ConvergenceType(Enum):
    """How per-vertex convergence checks are combined: over ALL vertices or ANY vertex."""

    ALL = 1
    ANY = 2


def sum_edges(edges: Dict[str, float]) -> float:
    """Add up the weights of an edge map; the keys are ignored.

    Args:
        edges: Mapping whose values are edge weights.

    Returns:
        The sum of the weights (``0`` for an empty mapping).
    """
    total = 0
    for weight in edges.values():
        total += weight
    return total

In [None]:
def accumulate_score(vertex: Vertex, ws: List[float], denom: List[float]):
    """Sum the score flowing into ``vertex`` along its incoming edges.

    Each incoming edge from a source vertex contributes
    ``weight / denom[source] * ws[source]``.

    Args:
        vertex: The vertex whose ``edges_in`` map is read.
        ws: Current score of every vertex, indexed by vertex index.
        denom: Per-vertex divisor, indexed by vertex index.

    Returns:
        The ``math.fsum`` of all contributions (``0.0`` when there are no incoming edges).
    """
    contributions = []
    for source_idx, weight in vertex.edges_in.items():
        contributions.append(weight / denom[source_idx] * ws[source_idx])
    return math.fsum(contributions)

In [None]:
def text_rank_init(
    graph: AdjacencyList, uniform: bool = True, seed: Optional[int] = None
) -> Tuple[List[float], List[float]]:
    """Produce the starting scores and the per-vertex outgoing-weight divisors.

    Reseeds the module-level ``random`` generator with ``seed`` as a side effect.

    Args:
        graph: The graph whose vertices are scored.
        uniform: When true every vertex starts at ``1 / V``; otherwise the start
            scores are random draws rescaled to sum to one.
        seed: Value passed to ``random.seed``.

    Returns:
        A ``(ws, denom)`` tuple of per-vertex lists in vertex order.
    """
    random.seed(seed)
    vertex_total = len(graph.vertices)

    denom = []
    for vertex in graph.vertices:
        outgoing = sum_edges(vertex.edges_out)
        # If the sum of all outgoing edges of V_j is 0.0, V_j has no edge into any V_i,
        # so the divisor is never applied to a non-zero weight; use 1.0 as a safe stand-in.
        denom.append(1.0 if outgoing == 0.0 else outgoing)

    if not uniform:
        draws = [random.random() for _ in range(vertex_total)]
        draw_sum = sum(draws)
        return [draw / draw_sum for draw in draws], denom
    return [1 / vertex_total for _ in range(vertex_total)], denom

In [None]:
def text_rank_update(
    graph: AdjacencyList, ws: List[float], denom: List[float], damping: float = 0.85
) -> List[float]:
    """Run one scoring iteration over every vertex.

    Each new score is ``(1 - damping) + damping * incoming``, where ``incoming``
    is the value ``accumulate_score`` returns for that vertex.

    Args:
        graph: The graph being scored.
        ws: Scores from the previous iteration, indexed by vertex index.
        denom: Per-vertex divisors, indexed by vertex index.
        damping: The damping factor.

    Returns:
        A new list of scores in vertex order; ``ws`` is not modified.
    """
    refreshed = []
    for vertex in graph.vertices:
        incoming = accumulate_score(vertex, ws, denom)
        refreshed.append((1 - damping) + damping * incoming)
    return refreshed

In [None]:
def text_rank_output(graph: AdjacencyList, ws: List[float]) -> List[Tuple[str, float]]:
    """Pair each vertex label with its normalized score, best first.

    Args:
        graph: The scored graph.
        ws: Final score of every vertex, in vertex order.

    Returns:
        ``(label, score)`` tuples sorted by descending score, where the scores
        have been divided by their sum.
    """
    total = sum(ws)
    scaled = [w / total for w in ws]
    ranked = [(vertex.value, score) for vertex, score in zip(graph.vertices, scaled)]
    # list.sort is stable, so equal scores keep their vertex order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [None]:
def text_rank(
    graph: Graph,
    damping: float = 0.85,
    convergence: float = 0.0001,
    convergence_type: ConvergenceType = ConvergenceType.ALL,
    GraphType=AdjacencyList,
    niter: int = 200,
    uniform: bool = False,
    seed: Optional[int] = None,
) -> List[Tuple[str, float]]:
    """Score the vertices of ``graph`` by iterating the TextRank update.

    Args:
        graph: The graph to score.
        damping: Damping factor; must lie in ``[0, 1]``.
        convergence: Iteration stops once per-vertex score changes fall below this.
        convergence_type: ``ConvergenceType.ALL`` requires every vertex to be below
            the threshold; any other value stops as soon as one vertex is.
        GraphType: Accepted for interface compatibility; not used by this function.
        niter: Maximum number of update iterations.
        uniform: Passed to ``text_rank_init`` to choose uniform or random start scores.
        seed: Passed to ``text_rank_init`` to seed the random start scores.

    Returns:
        ``(label, normalized score)`` tuples as produced by ``text_rank_output``.

    Raises:
        ValueError: If ``damping`` is outside ``[0, 1]``.
    """
    if not 0 <= damping <= 1:
        raise ValueError(f"damping must be between `0` and `1`, got {damping}")
    converge = all if convergence_type is ConvergenceType.ALL else any

    ws_prev, denom = text_rank_init(graph, uniform=uniform, seed=seed)
    # Start from the initial scores so the result is defined even when the loop
    # body never runs (niter <= 0).
    ws = ws_prev

    for _ in range(niter):
        ws = text_rank_update(graph, ws_prev, denom, damping)
        if converge(abs(p - c) < convergence for p, c in zip(ws_prev, ws)):
            break
        ws_prev = ws

    return text_rank_output(graph, ws)

In [None]:
def sentence_graph(
    sentences: List[str],
    sim: Callable[..., float] = overlap,
    norm: Callable[[str], str] = norm_sentence,
    GraphType: Type[Graph] = AdjacencyList,
) -> Tuple[Graph, Dict[str, List[int]]]:
    """Build a similarity graph with one vertex per distinct normalized sentence.

    Args:
        sentences: The raw sentences.
        sim: Similarity callable. It is called with the two normalized sentences
            plus the keyword arguments ``raw_s1``, ``raw_s2``, ``s1_idx`` and ``s2_idx``.
        norm: Callable that normalizes one raw sentence.
        GraphType: Graph class, constructed from the label -> index vocabulary.

    Returns:
        ``(graph, offsets)`` where ``offsets`` maps each normalized sentence to the
        positions in ``sentences`` that normalize to it.
    """
    offsets = defaultdict(list)
    normed = [norm(sentence) for sentence in sentences]
    # Use a distinct loop name so the ``norm`` callable is not shadowed.
    for position, normed_sentence in enumerate(normed):
        offsets[normed_sentence].append(position)

    vocab = build_vocab(normed)
    graph = GraphType(vocab)

    # we add edges in either direction (bidirectional) to simulate the undirected graph
    for (i, src), (j, tgt) in combinations(enumerate(normed), 2):
        # Sentences that normalize to the same text share one vertex; linking them
        # would be a self loop, which add_edge rejects.
        if src == tgt:
            continue
        graph.add_edge(src, tgt, sim(src, tgt, raw_s1=sentences[i], raw_s2=sentences[j], s1_idx=i, s2_idx=j))
        graph.add_edge(tgt, src, sim(tgt, src, raw_s1=sentences[j], raw_s2=sentences[i], s1_idx=j, s2_idx=i))
    return graph, offsets

In [None]:
def summarize(
    sentences: List[str],
    nsents: Optional[int] = None,
    keep_order: bool = True,
    damping: float = 0.85,
    convergence: float = 0.0001,
    convergence_type: ConvergenceType = ConvergenceType.ALL,
    GraphType=AdjacencyList,
    niter: int = 200,
    seed: Optional[int] = None,
    sim: Callable[..., float] = overlap,
    norm: Callable[[str], str] = norm_sentence,
) -> List[str]:
    """Pick the highest-ranked sentences as an extractive summary.

    Args:
        sentences: The raw sentences of the document.
        nsents: Number of sentences to keep; defaults to a third of the input
            (integer division).
        keep_order: When true the selected sentences are returned in document
            order, otherwise in rank order.
        damping: Forwarded to ``text_rank``.
        convergence: Forwarded to ``text_rank``.
        convergence_type: Forwarded to ``text_rank``.
        GraphType: Graph class forwarded to ``sentence_graph``.
        niter: Forwarded to ``text_rank``.
        seed: Forwarded to ``text_rank``.
        sim: Similarity callable forwarded to ``sentence_graph``.
        norm: Normalizer forwarded to ``sentence_graph``.

    Returns:
        The selected raw sentences. Each ranked label is mapped back to the first
        raw sentence that normalized to it.
    """
    graph, offsets = sentence_graph(sentences, sim, norm, GraphType)
    if nsents is None:
        nsents = len(sentences) // 3

    ranked = text_rank(
        graph, damping=damping, convergence=convergence, convergence_type=convergence_type, niter=niter, seed=seed,
    )
    picked = []
    for label, _score in ranked[:nsents]:
        # Several raw sentences may share a label; take the earliest one.
        picked.append(offsets[label][0])

    if keep_order:
        picked = sorted(picked)
    return [sentences[position] for position in picked]

In [None]:
# Load every JSON document under ./data and build a two-sentence summary of each.
# Presumably each file holds a JSON list of sentence strings -- confirm against the data.
summs = []
sents = []
sentences = []
# Sort the paths: glob order depends on the filesystem, which would make
# sentences[0] / summs[0] differ between runs and machines.
for file_name in sorted(Path('./data').glob('*.json')):
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(file_name, encoding="utf-8") as f:
        sentences.append(json.load(f))
for sents in sentences:
    summs.append(summarize(sents, 2))

In [None]:
# Show the first loaded document; raises IndexError if no JSON file was loaded.
print(sentences[0])

In [None]:
# Show the summary built for the first loaded document.
print(summs[0])