Source code for pyCP_APR.applications.tensor_anomaly_detection

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
tensor_anomaly_detection.py performs p-value scoring over the tensor decomposition, i.e.
the KRUSKAL tensor M. The calculated p-values are used to detect anomalies.\n
This method was introduced by Eren et al. in [1].

CyberToaster, Project 1, Summer 2020\n
Los Alamos National Laboratory\n

Anomaly detection using Tensors and their Decompositions.\n
Student: Maksim E. Eren\n
Primary Mentor: Juston Moore\n
Secondary Mentors: Boian Alexandrov and Patrick Avery

References
========================================
[1] M. E. Eren, J. S. Moore and B. S. Alexandro, "Multi-Dimensional Anomalous Entity Detection via Poisson Tensor Factorization," 2020 IEEE International Conference on Intelligence and Security Informatics (ISI), 2020, pp. 1-6, doi: 10.1109/ISI49825.2020.9280524.

@author: Maksim Ekin Eren, Juston S. Moore
"""
import sys
import numpy as np
import scipy.special as sc
from scipy.special import chdtrc as chi2_cdf
from scipy.stats import poisson


[docs]class PoissonTensorAnomaly(): """ Anomaly detection using Poisson Distribution and Canonical Polyadic (CP) with Alternating Poisson Regression tensor decomposition (CP-APR). Componenets of the CP-APR used to calculate the p-values for each instance through Poisson cumulative distribution function (cdf). p-values are then used to determine if the event is an anomaly. Lower p-values are more anomalous. v2: Utilizes Numpy vectorization for the calculations. References:\n 1) Chi, Eric C. and Tamara G. Kolda. “On Tensors, Sparsity, and Nonnegative Factorizations.” SIAM J. Matrix Anal. Appl. 33 (2012): 1272-1299.\n 2) Turcotte, Melissa J. M. et al. “Unified Host and Network Data Set. ” ArXiv abs/1708.07518 (2017): n. pag.\n 3) Wikipedia contributors. "Poisson distribution." Wikipedia, The Free Encyclopedia. Wikipedia, The Free Encyclopedia, 29 Jun. 2020. Web. 6 Jul. 2020. """ # LIBRARY INFORMATION __version_info__ = ('2', '1', '0') __version__ = '.'.join(__version_info__) __author__ = "Maksim E. Eren, Juston Moore" __info__ = "Anomaly detection using Poisson Distribution and Canonical Polyadic (CP) \ with Alternating Poisson Regression tensor decomposition (CP-APR). This version utilize \ Numpy vectorization in the calculations." def __init__(self, dimensions=dict(), weights=list(), objective='p_value', lambda_method='single_tensor', p_value_fusion_index=[0], ensemble_dimensions=dict(), ensemble_weights=list(), ensemble_significance=[0.1, 0.9], mode_weights=[1], ignore_dimensions_indx=[]): r""" Initilize the anomaly detector class. Parameters ---------- dimensions : dict, required Components of the KRUSKAL Tensor Decomposition. The default is dict.\n Each element is a dimension (factors of a component) and each dimension has (nxK) elements for that factor for rank K. weights : list, required Weights of each component of parameter dimensions. The default is list. objective : string, optional What to calculate.\n Options: p_value, p_value_fusion_harmonic, p_value_fusion_harmonic_observed, p_value_fusion_chi2, p_value_fusion_chi2_observed, p_value_fusion_arithmetic, log_likelihood The default is 'p_value'. lambda_method : string, optional How to calculate lambda.\n If 'single_tensor', it will use single ktensor passed in dimensions when calculating lambda.\n If 'ensemble', it will use two ktensors where parameter dimensions is a K>=1 rank tensor with lambda weight ensemble_significance[0] and parameter ensemble_dimensions is a ktensor with K>1 rank tensor with lambda weight ensemble_significance[1]. The default is 'single_tensor'. p_value_fusion_index : list Index to fix, or calculate the p-value fusions. Only used when objective is set to p_value_fusion. The default is [0]. ensemble_dimensions : dict, optional Components of the KRUSKAL Tensor Decomposition.\n Each element is a dimension (factors of a component) and each dimension has (nxK) elements for that factor for rank K.\n This is the second ktensor dimension passed. It will be used if lambda_method is set to 'ensemble'. Its lambda weight is ensemble_significance[1]. The default is dict(). ensemble_weights : list, optional Weights of each component of ensemble_dimensions. The default is list(). Only used if lambda_method is 'ensemble'. ensemble_significance : list, optional lambda weight of each ktensor when using 'ensemble' lambda_method.\n Weight of dimensions: ensemble_significance[0]\n. Weight of ensemble_dimensions: ensemble_significance[1]\n The default is [0.1, 0.9]. mode_weights : list, optional Weight of each dimension.\n The default is [1]. ignore_dimensions_indx : list, optional If any dimension in latent factors should be ignored when calculating the lambdas.\n The default is []. """ # Store KRUSKAL Tensor componenets # Each index in dimensions is a dimension from the tensor # and each dimension consist of that dimension's values and Ranks self.dimensions = list() for _, value in dimensions.items(): numpy_array = np.array([np.array(xi) for xi in value]) self.dimensions.append(numpy_array) # K, Rank of the tensor # if K = 1 if len(self.dimensions[0].shape) == 1: # Component weights self.weights = np.array([weights]) # Rank self.K = 1 else: # Component weights self.weights = np.array(weights) # Rank self.K = self.dimensions[0].shape[1] # D, Order or mode of the tensor. Number of dimensions # There are D factors within each K components self.D = len(self.dimensions) # Tensor/Dimensions Size self.dimensions_size = list() for dimension in self.dimensions: self.dimensions_size.append(dimension.shape[0]) # How to calculate the scores avail_objectives = ['p_value', \ 'p_value_fusion_harmonic', 'p_value_fusion_harmonic_observed', \ 'p_value_fusion_arithmetic', \ 'p_value_fusion_chi2', 'p_value_fusion_chi2_observed', \ 'log_likelihood'] if objective not in avail_objectives: sys.exit('Invalid objective. Available methods: ' + ','.join(avail_objectives)) self.objective = objective # Lamda calculation method avail_lambda_methods = ['single_tensor', 'ensemble'] if objective not in avail_objectives: sys.exit('Invalid lambda method. Available methods: ' + ','.join(avail_lambda_methods)) self.lambda_method = lambda_method # ensemble lambda will have multiple tensors with different ranks to be used # when calculating the lambda self.ensemble_dimensions = list() self.ensemble_weights = list() self.ensemble_significance = ensemble_significance self.K_ensemble = -1 # extract dimensions from each tensor in the bag if self.lambda_method == 'ensemble': for _, value in ensemble_dimensions.items(): numpy_array = np.array([np.array(xi) for xi in value]) self.ensemble_dimensions.append(numpy_array) if len(self.ensemble_dimensions[0].shape) == 1: self.ensemble_weights = np.array([ensemble_weights]) self.K_ensemble = 1 else: self.ensemble_weights = np.array(ensemble_weights) self.K_ensemble = self.ensemble_dimensions[0].shape[1] self.p_value_fusion_index = p_value_fusion_index # Weight of each dimension if len(mode_weights) > 1: if len(mode_weights) > len(self.dimensions): sys.exit('Mode weights must be same number of dimensions.') self.mode_weights = mode_weights else: self.mode_weights = mode_weights * len(self.dimensions) # Dimensions to ignore when calculating the lambdas self.ignore_dimensions_indx = ignore_dimensions_indx
[docs] def predict(self, coords, values, from_matlab=False): ''' Get the scores using the KRUSKAL components given the non-zero coordinates and values and the objective. Parameters ---------- coords : list of list Coordinates of the non-zero elements within the sparse tensor. values : list Non-zero values that are in the sparse tensor. from_matlab : bool Set True if need to substract 1 to the coordinates, since matlab starts at 1.\n The default is False. Returns ------- prediction : dict Dictionary of calculated objective. ''' # change coords to a numpy array or arrays coords = np.array([np.array(xi) for xi in coords]) # change values to a numpy array values = np.array(values) # substract 1 from each element since for Matlab +1 was done if from_matlab: coords -= 1 # calculate Poisson Lambda values for each instance, or event lambdas = self.__get_lambdas(coords) # calculate the p_values if self.objective == 'p_value': scores = self.__get_p_values(lambdas, values) # calculate p_value fusion with harmonic mean elif self.objective == 'p_value_fusion_harmonic': p_values = self.__get_p_values(lambdas, values) scores = self.__get_p_value_fusion_harmonic(p_values, coords) # calculate p_value fusion with harmonic mean for observed events elif self.objective == 'p_value_fusion_harmonic_observed': p_values = self.__get_p_values(lambdas, values) scores = self.__get_p_value_fusion_harmonic_observed(p_values, coords) # calculate p_value fusion with arithmetic mean elif self.objective == 'p_value_fusion_arithmetic': p_values = self.__get_p_values(lambdas, values) scores = self.__get_p_value_fusion_arithmetic(p_values, coords) # cakculate p_value fusion with chi-squared test elif self.objective == 'p_value_fusion_chi2': p_values = self.__get_p_values(lambdas, values) scores = self.__get_p_value_fusion_chi2(p_values, coords) # cakculate p_value fusion with chi-squared test for observed events elif self.objective == 'p_value_fusion_chi2_observed': p_values = self.__get_p_values(lambdas, values) scores = self.__get_p_value_fusion_chi2_observed(p_values, coords) # calculate log_likelihood elif self.objective == 'log_likelihood': scores = self.__get_log_likelihood(lambdas, coords, values) # prepare the return value prediction = dict() prediction[self.objective] = scores prediction['lambda'] = lambdas return prediction
def __get_log_likelihood(self, lambdas, coords, values): ''' Calculate the log-likelihood given the Poisson lambda values, coordinates of the non-zero elements, and list of non-zero elements. Parameters ---------- lambdas : numpy array array of Poisson lambdas. coords : numpy array Coordinates of the non-zero elements within the sparse tensor. values : numpy array non-zero values of a tensor. Returns ------- log_likelihood : float log-likelihood score of given coordinates. ''' # Calculate the non-zero log sum log_sum_nnz = 0 for index, lambd in enumerate(lambdas): log_sum_nnz += ((values[index] * np.log(lambd)) - sc.gammaln(values[index] + 1)) # Calculate Lambda (capital lambda) theta = 1 for dimension_index, dimension in enumerate(self.dimensions): if self.K == 1: # if rank is 1, no components to sum theta *= (dimension[coords[:, dimension_index]] * self.weights) else: theta *= (dimension[coords[:, dimension_index]] * self.weights).sum(axis=1) Lambda = theta.sum() if self.lambda_method == 'ensemble': # main ktensor's lambdas weight Lambda *= self.ensemble_significance[0] # Calculate Lambda_2 (capital lambda 2), Lambda for the secondary tensor theta = 1 for dimension_index, dimension in enumerate(self.ensemble_dimensions): if self.K_ensemble == 1: # if rank is 1, no components to sum theta *= (dimension[coords[:, dimension_index]] * self.ensemble_weights) else: theta *= (dimension[coords[:, dimension_index]] * self.ensemble_weights).sum(axis=1) # Add Lambda_2 to Lambda Lambda += (theta.sum() * self.ensemble_significance[1]) log_likelihood = log_sum_nnz - Lambda return log_likelihood def __get_p_value_fusion_arithmetic(self, p_values, coords): ''' Calculate p_value for instances in dimension 'p_value_fusion_index' using p_value fusion with arithmetic mean over the all tensor (including zeros). Parameters ---------- p_values : list List of Poisson p-values. coords : numpy array Coordinates of the non-zero elements within the sparse tensor. Returns ------- p_value_fusions : list p_value for each instance in dimension 'p_value_fusion_index' ''' p_value_fusions = list() # unique array of target instances to calculate p_value fussion unfold_target = np.unique(coords[:, self.p_value_fusion_index], axis=0) # unfold size is the tensor size unfolding at index 'p_value_fusion_index' # multiply dimension sizes ignoring the unfolding dimension 'p_value_fusion_index' unfold_size = np.delete(self.dimensions_size, self.p_value_fusion_index).prod() # calculate p-value for each target for target in unfold_target: # coordinates of the current target nnz_indices = np.where((coords[:, self.p_value_fusion_index] == target).all(axis=1)) # remove the current target from bag for speed improvment on next iteration coords = np.delete(coords, nnz_indices, axis=0) target_p_value_sum = p_values[nnz_indices].sum() # total number of zero elements stored in the tensor at the current unfolding total_zero = unfold_size - len(nnz_indices) arithmetic_p_value = (target_p_value_sum + total_zero) / unfold_size p_value_fusions.append(arithmetic_p_value) return p_value_fusions def __get_p_value_fusion_harmonic(self, p_values, coords): ''' Calculate p_value for instances in dimension 'p_value_fusion_index' using p_value fusion with harmonic mean over the all tensor (including zeros). Parameters ---------- p_values : list List of Poisson p-values. coords : numpy array Coordinates of the non-zero elements within the sparse tensor. Returns ------- p_value_fusions : list p_value for each instance in dimension 'p_value_fusion_index' ''' p_value_fusions = list() # unique array of target instances to calculate p_value fussion unfold_target = np.unique(coords[:, self.p_value_fusion_index], axis=0) # unfold size is the tensor size unfolding at index 'p_value_fusion_index' # multiply dimension sizes ignoring the unfolding dimension 'p_value_fusion_index' unfold_size = np.delete(self.dimensions_size, self.p_value_fusion_index).prod() # calculate p-value for each target for target in unfold_target: # coordinates of the current target nnz_indices = np.where((coords[:, self.p_value_fusion_index] == target).all(axis=1)) # remove the current target from bag for speed improvment on next iteration coords = np.delete(coords, nnz_indices, axis=0) target_p_value_sum = (1 / p_values[nnz_indices]).sum() # total number of zero elements stored in the tensor at the current unfolding total_zero = unfold_size - len(nnz_indices) harmonic_p_value = unfold_size / (target_p_value_sum + total_zero) p_value_fusions.append(harmonic_p_value) return p_value_fusions def __get_p_value_fusion_harmonic_observed(self, p_values, coords): ''' Calculate p_value for instances in dimension 'p_value_fusion_index' using p_value fusion with harmonic mean over the observed events only. Parameters ---------- p_values : list List of Poisson p-values. coords : numpy array Coordinates of the non-zero elements within the sparse tensor. Returns ------- p_value_fusions : list p_value for each instance in dimension 'p_value_fusion_index' ''' p_value_fusions = list() # unique array of target instances to calculate p_value fussion unfold_target = np.unique(coords[:, self.p_value_fusion_index], axis=0) # calculate p-value for each target for target in unfold_target: # coordinates of the current target nnz_indices = np.where((coords[:, self.p_value_fusion_index] == target).all(axis=1)) # remove the current target from bag for speed improvment on next iteration coords = np.delete(coords, nnz_indices, axis=0) target_p_value_sum = (1 / p_values[nnz_indices]).sum() harmonic_p_value = len(nnz_indices) / target_p_value_sum p_value_fusions.append(harmonic_p_value) return p_value_fusions def __get_p_value_fusion_chi2_observed(self, p_values, coords): ''' Calculate p-value fusion using Fisher's test over observed events Parameters ---------- p_values : list List of Poisson p-values. coords : numpy array Coordinates of the non-zero elements within the sparse tensor. Returns ------- p_value_fusions : list p_value for each instance in dimension 'p_value_fusion_index' ''' p_value_fusions = list() # unique array of target instances to calculate p_value fussion unfold_target = np.unique(coords[:, self.p_value_fusion_index], axis=0) # calculate p-value for each target for target in unfold_target: # coordinates of the current target nnz_indices = np.where((coords[:, self.p_value_fusion_index] == target).all(axis=1)) # remove the current target from bag for speed improvment on next iteration coords = np.delete(coords, nnz_indices, axis=0) statistic = -2 * (np.log(p_values[nnz_indices]).sum()) k = len(nnz_indices) # chi2_p_value = chi2.cdf(2*k, statistic) chi2_p_value = chi2_cdf(2 * k, statistic) p_value_fusions.append(chi2_p_value) return p_value_fusions def __get_p_value_fusion_chi2(self, p_values, coords): ''' Calculate p-value fusion using Fisher's test Parameters ---------- p_values : list List of Poisson p-values. coords : numpy array Coordinates of the non-zero elements within the sparse tensor. Returns ------- p_value_fusions : list p_value for each instance in dimension 'p_value_fusion_index' ''' p_value_fusions = list() # unique array of target instances to calculate p_value fussion unfold_target = np.unique(coords[:, self.p_value_fusion_index], axis=0) # unfold size is the tensor size unfolding at index 'p_value_fusion_index' # multiply dimension sizes ignoring the unfolding dimension 'p_value_fusion_index' unfold_size = np.delete(self.dimensions_size, self.p_value_fusion_index).prod() # calculate p-value for each target for target in unfold_target: # coordinates of the current target nnz_indices = np.where((coords[:, self.p_value_fusion_index] == target).all(axis=1)) # remove the current target from bag for speed improvment on next iteration coords = np.delete(coords, nnz_indices, axis=0) statistic = -2 * (np.log(p_values[nnz_indices]).sum()) # chi2_p_value = chi2.cdf(2*k, statistic) chi2_p_value = chi2_cdf(2 * unfold_size, statistic) p_value_fusions.append(chi2_p_value) return p_value_fusions def __get_p_values(self, lambdas, values): ''' Calculate the Poisson p-value for the given x (number of occurrences values[i]) and Poisson lambda (lambdas[i]) Parameters ---------- lambdas : numpy array array of Poisson lambdas. values : numpy array non-zero values of a tensor. Returns ------- p_values : list List of Poisson p-values. ''' p_value_calculator = lambda value, lamb: (1 - poisson.cdf(int(value) - 1, lamb)) p_value_func = np.vectorize(p_value_calculator) p_values = p_value_func(values, lambdas) return p_values def __get_lambdas(self, coords): ''' Calculate lambda, average number of occurences for Poisson Distribution. Parameters ---------- coords : numpy array Coordinates of the non-zero elements within the sparse tensor. Returns ------- lambdas : numpy array array of Poisson lambda values. ''' theta = 1 for dimension_index, dimension in enumerate(self.dimensions): if dimension_index not in self.ignore_dimensions_indx: theta *= (dimension[coords[:, dimension_index]]) * self.mode_weights[dimension_index] theta *= self.weights if self.K == 1: # if rank is 1, no components to sum lambdas = theta else: lambdas = theta.sum(axis=1) if self.lambda_method == 'ensemble': # main ktensor's lambdas weight lambdas *= self.ensemble_significance[0] # calculate the secondary tensor's lambdas theta = 1 for dimension_index, dimension in enumerate(self.ensemble_dimensions): if dimension_index not in self.ignore_dimensions_indx: theta *= dimension[coords[:, dimension_index]] theta *= self.ensemble_weights # combine lambdas if self.K_ensemble == 1: # if rank is 1, no components to sum lambdas += (theta * self.ensemble_significance[1]) else: lambdas += (theta.sum(axis=1) * self.ensemble_significance[1]) return lambdas