Assumptions:

    - Network topology and attacker position in the network are always known
    - The network components which are not involved in the attack phase are considered benign
    - Malicious traffic is mixed with benign traffic (legitimate access to mail servers, etc.)
Note:

    - Requires the tstat flow files (path for the csv files is local)
    - We remove IP 10.9.0.X (managing address from another network - ignore)
    - Flows with FQDN containing wpscan represent the tool update at the attacker's side

In [447]:
import pandas as pd
from glob import glob
import numpy as np
import yaml
import json
from datetime import datetime, timedelta
from difflib import SequenceMatcher
from functools import reduce

In [448]:
# Show every column and every row when a DataFrame is displayed (no truncation).
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

### Label TCP complete 
TCP flows with complete handshake and teardown, mostly with actual payload exchange.

In [449]:
def label_benign(x):
    """Return the benign-traffic label of one TCP flow, or "" if no rule matches.

    The rules are evaluated in order and the first match wins.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``role_cli``, ``role_serv`` and ``s_port:16``.

    Returns
    -------
    str
        One of the benign labels below, or the empty string.
    """
    mail_ports = (25, 465, 587, 993)
    proxied_roles = ("intranet_server", "webserver", "cloud_share", "mail")

    # Server outside the known topology: browser traffic towards the internet.
    if x["role_serv"] == "external":
        return "browsing/update"

    # Standard communication between the cloud share and the internal share.
    if x["role_cli"] == "cloud_share" and x["role_serv"] == "internal_share":
        return "benign_share"

    # Mail servers on well-known mail ports; some of this traffic passes
    # through the firewall, hence the "inet-firewall" role on either side.
    if (
        ("mail" in x["role_cli"] or "inet-firewall" in x["role_cli"])
        and ("mail" in x["role_serv"] or "inet-firewall" in x["role_serv"])
        and x["s_port:16"] in mail_ports
    ):
        return "mail"

    # Anything addressed to the monitoring host.
    if x["role_serv"] == "monitoring":
        return "monitoring"

    # Employees reaching the internal web server.
    # NOTE(review): the server port is not checked here - confirm that is intended.
    if "internal_employee" in x["role_cli"] and x["role_serv"] == "intranet_server":
        return "HTTP(S) intra"

    # Employees reaching the DMZ web server (port not checked either).
    if "internal_employee" in x["role_cli"] and x["role_serv"] == "webserver":
        return "HTTP(S) DMZ"

    # SSH logins of regular users; the attacker is explicitly excluded.
    if (
        x["s_port:16"] == 22
        and x["role_serv"] in ("mail", "intranet_server", "cloud_share")
        and x["role_cli"] != "attacker_0"
    ):
        return "SSH"

    # The web server acting as an HTTP or HTTPS proxy towards internal services.
    if (
        x["s_port:16"] in (443, 80)
        and x["role_cli"] == "webserver"
        and x["role_serv"] in proxied_roles
    ):
        return "proxy"

    return ""
    
        

In [450]:
def label_port_based(x):
    """Resolve a still-empty label from the server port; keep any existing label.

    Meant for benign flows that fit none of the categories of the role-based
    benign labelling, so it has to run after the attack phases have been
    labelled.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``label``; ``s_port:16`` is read only when the label is "".

    Returns
    -------
    str
        The existing label if it is not empty, otherwise a port-based label.
        Flows on any other port (<= 1024 and not 80/443/53) stay ``""``
        instead of silently becoming ``None``/NaN.
    """
    current_label = x["label"]
    if current_label != "":
        return current_label

    port = x["s_port:16"]
    if port == 443:
        return "HTTPS"
    if port == 80:
        return "HTTP"
    if port == 53:
        return "DNS"
    if port > 1024:
        return "update/command on unassigned port"

    # No port rule applies: leave the flow explicitly unlabelled so the
    # column keeps a single (string) type.
    return ""

In [451]:
# Dataset to label: used to build the input/output paths and to select the
# matching row of simulation_lists.csv.
company_name = "russellmitchell"

In [452]:
# Per-dataset metadata downloaded from the dataset webpage
# (see format_dataset_info.ipynb for how the file is generated).
simulation_list = pd.read_csv("simulation_lists.csv")

In [453]:
# Index by dataset name so that a row can be selected with .loc[company_name, ...].
simulation_list = simulation_list.set_index("Dataset")

In [454]:
# "Simulation time" holds both endpoints in one string separated by " - ".
times = simulation_list.loc[company_name, "Simulation time"].split(" - ")

In [455]:
# Replace the blank between date and time with "T"; these strings are later
# compared against the flow timestamps to keep only the simulation window.
start_date = times[0].replace(" ", "T")
end_date = times[1].replace(" ", "T")

In [456]:
# Collect every tstat log_tcp_complete file of the dataset.
# NOTE: recursive=True only affects "**" patterns, so it has no effect here.
complete_filelist = glob("./{}_logs/log*/*.out/log_tcp_complete".format(company_name), recursive=True)

In [457]:
# Read each space-separated log and stack them into a single frame.
# pd.concat raises ValueError when the file list is empty.
tcp_complete = pd.concat([pd.read_csv(filename, sep=" ") for filename in complete_filelist])
# Display (rows, columns) of the raw data.
tcp_complete.shape

(139675, 130)

In [458]:
# Address to discard; it differs for every company and is also read from simulation_lists.csv.
ip_to_filter = simulation_list.loc[company_name, "IP to filter"]

In [459]:
# Size before removing the filtered IP.
tcp_complete.shape

(139675, 130)

In [460]:
# Drop every flow whose client or server address equals the filtered IP.
# NOTE(review): this is an exact match on a single address - confirm it covers
# the whole management range mentioned in the notes at the top.
uses_filtered_ip = (tcp_complete["#15#c_ip:1"] == ip_to_filter) | (tcp_complete["s_ip:15"] == ip_to_filter)
tcp_complete = tcp_complete[~uses_filtered_ip]

In [461]:
# Size after removing the filtered IP.
tcp_complete.shape

(139442, 130)

In [462]:
# Convert the first-packet time ("first:29", interpreted as epoch milliseconds)
# to a datetime and keep only flows strictly inside the simulation window.
tcp_complete["timestamp"] = pd.to_datetime(tcp_complete["first:29"], unit='ms')
started_after = tcp_complete["timestamp"] > start_date
started_before = tcp_complete["timestamp"] < end_date
tcp_complete = tcp_complete[started_after & started_before]
tcp_complete.shape

(123332, 131)

In [463]:
# Load the server inventory of the dataset (role -> groups and IPv4 addresses).
# Forward slashes are used so the path works on every platform and contains no
# invalid backslash escape sequences.
with open('./{}/processing/config/servers.yaml'.format(company_name)) as file:
    # FullLoader converts the YAML scalars/mappings into plain Python objects.
    # NOTE(review): prefer yaml.safe_load if the file may come from an untrusted source.
    servers = yaml.load(file, Loader=yaml.FullLoader)

In [464]:
# Lookup table with one row per (role, IPv4 address) and the network it belongs to.
df_roles = pd.DataFrame([], columns = ["role", "ipv4_address", "network"])
# Group names of servers.yaml that identify a network segment.
networks = ["internet", "dmz", "intranet", "firewall"]

In [465]:
# Build df_roles with one row per (role, IPv4 address); a role with several
# addresses (e.g. the firewall) yields several rows. Rows are collected in a
# list and the frame is created once: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every call.
role_rows = []
for role, server in servers.items():
    # Keep the last group that names a known network, if any.
    network = None
    for group in server["groups"]:
        if group in networks:
            network = group

    for ip in server["ipv4_addresses"]:
        row = {"role": role, "ipv4_address": ip}
        if network is not None:
            # When no group matches, the key is left out so the cell becomes NaN.
            row["network"] = network
        role_rows.append(row)

df_roles = pd.DataFrame(role_rows, columns=["role", "ipv4_address", "network"])
    
    

In [466]:
# Attach role / address / network first for the client, then for the server.
# Endpoints whose IP is not listed in df_roles are tagged "external". Only the
# three joined columns are filled: a frame-wide fillna would also overwrite
# genuinely missing tstat values with the string "external".
role_fill = {"role": "external", "ipv4_address": "external", "network": "external"}

tcp_complete = tcp_complete.merge(df_roles, right_on="ipv4_address", left_on="#15#c_ip:1", how="left").fillna(role_fill)
tcp_complete = tcp_complete.rename(columns={"role":"role_cli", "ipv4_address": "ipv4_address_cli", "network":"network_cli"})

tcp_complete = tcp_complete.merge(df_roles, right_on="ipv4_address", left_on="s_ip:15", how="left").fillna(role_fill)
tcp_complete = tcp_complete.rename(columns={"role":"role_serv", "ipv4_address": "ipv4_address_serv", "network":"network_serv"})

In [467]:
# Row-wise benign labelling; flows matching no rule get "" and are resolved by the following steps.
tcp_complete["label"] = tcp_complete.apply(label_benign, axis=1)

Label attack phases

In [468]:
# Load the attacker's foothold configuration; the labelling loops below read
# "start"/"stop" timestamps from each of its entries.
with open('./{}/processing/config/attacker/foothold.yaml'.format(company_name)) as file:
    foothold = yaml.load(file, Loader=yaml.FullLoader)

In [469]:
# Entries available before removing the non-phase keys.
foothold.keys()

dict_keys(['start', 'stop', 'vpn_connect', 'traceroute', 'host_discover_local', 'host_discover_dmz', 'dns_brute_force_start', 'service_scan', 'wpscan', 'dirb_scan', 'upload_rce_shell', 'host_recon_commands', 'online_cracking', 'run_cracking'])

In [470]:
# Read the "Password cracking" field for this dataset; it is compared with "No" below.
pwd_cracking = simulation_list.loc[company_name, "Password cracking"]

In [471]:
# "start"/"stop" are always removed; the two cracking entries are removed as
# well when the dataset has no password cracking.
keys_to_pop = ["start", "stop"]
if pwd_cracking == "No":
    keys_to_pop += ['online_cracking', 'run_cracking']

for pop in keys_to_pop:
    foothold.pop(pop, None)

In [472]:
# Remaining entries: the attack phases to label.
foothold.keys()

dict_keys(['vpn_connect', 'traceroute', 'host_discover_local', 'host_discover_dmz', 'dns_brute_force_start', 'service_scan', 'wpscan', 'dirb_scan', 'upload_rce_shell', 'host_recon_commands', 'online_cracking', 'run_cracking'])

In [473]:
# Label every still-unlabelled flow that starts strictly inside an attack window.
# A regular phase is one window labelled with the phase name;
# "host_recon_commands" is a list of windows, each labelled with the command id.
for k, phase in foothold.items():
    is_command_list = k == "host_recon_commands"
    windows = phase if is_command_list else [phase]

    for window in windows:
        timestamp_start = datetime.strptime(window["start"], "%Y-%m-%dT%H:%M:%S.%fZ")
        timestamp_stop = datetime.strptime(window["stop"], "%Y-%m-%dT%H:%M:%S.%fZ")

        # Recomputed for every window: earlier windows may already have labelled rows.
        in_window = (
            (tcp_complete["timestamp"] > timestamp_start)
            & (tcp_complete["timestamp"] < timestamp_stop)
            & (tcp_complete["label"] == "")
        )
        tcp_complete.loc[in_window, "label"] = window["id"] if is_command_list else k


vpn_connect
traceroute
host_discover_local
host_discover_dmz
dns_brute_force_start
service_scan
wpscan
dirb_scan
upload_rce_shell
online_cracking
run_cracking


In [474]:
# Must run after the attack-phase labelling: flows that are still "" are resolved from the server port.
tcp_complete["label"] = tcp_complete.apply(label_port_based, axis=1)

In [475]:
# Number of flows per label (missing labels are not counted by value_counts).
tcp_complete.label.value_counts()

proxy                                48468
HTTPS                                26261
browsing/update                      25822
HTTP(S) DMZ                          12622
benign_share                          6208
mail                                  1112
HTTP(S) intra                          910
HTTP                                   785
monitoring                             484
service_scan                           375
dirb_scan                               90
wpscan                                  76
SSH                                     24
host_discover_dmz                       14
online_cracking                         11
DNS                                      5
host_discover_local                      4
list_www                                 2
check_ps_a                               2
check_netstat_l                          2
clear                                    2
check_wp_config                          2
check_uptime                             2
read_passwd

### Label TCP no-complete
TCP broken flows or scanning attempts

In [476]:
# Collect every tstat log_tcp_nocomplete file of the dataset.
# NOTE: recursive=True only affects "**" patterns, so it has no effect here.
nocomplete_filelist = glob("./{}_logs/log*/*.out/log_tcp_nocomplete".format(company_name), recursive=True)

In [477]:
# Read each space-separated log and stack them into a single frame.
tcp_nocomplete = pd.concat([pd.read_csv(filename, sep=" ") for filename in nocomplete_filelist])
# Display (rows, columns) of the raw data.
tcp_nocomplete.shape

(378137, 44)

In [478]:
# Drop every flow whose client or server address equals the filtered IP.
uses_filtered_ip = (tcp_nocomplete["#15#c_ip:1"] == ip_to_filter) | (tcp_nocomplete["s_ip:15"] == ip_to_filter)
tcp_nocomplete = tcp_nocomplete[~uses_filtered_ip]
tcp_nocomplete.shape

(154929, 44)

In [479]:
# Convert the first-packet time ("first:29", interpreted as epoch milliseconds)
# to a datetime and keep only flows strictly inside the simulation window.
tcp_nocomplete["timestamp"] = pd.to_datetime(tcp_nocomplete["first:29"], unit='ms')
started_after = tcp_nocomplete["timestamp"] > start_date
started_before = tcp_nocomplete["timestamp"] < end_date
tcp_nocomplete = tcp_nocomplete[started_after & started_before]
tcp_nocomplete.shape

(154877, 45)

In [480]:
# Attach role / address / network first for the client, then for the server.
# Endpoints whose IP is not listed in df_roles are tagged "external". Only the
# three joined columns are filled: a frame-wide fillna would also overwrite
# genuinely missing tstat values with the string "external".
role_fill = {"role": "external", "ipv4_address": "external", "network": "external"}

tcp_nocomplete = tcp_nocomplete.merge(df_roles, right_on="ipv4_address", left_on="#15#c_ip:1", how="left").fillna(role_fill)
tcp_nocomplete = tcp_nocomplete.rename(columns={"role":"role_cli", "ipv4_address": "ipv4_address_cli", "network":"network_cli"})

tcp_nocomplete = tcp_nocomplete.merge(df_roles, right_on="ipv4_address", left_on="s_ip:15", how="left").fillna(role_fill)
tcp_nocomplete = tcp_nocomplete.rename(columns={"role":"role_serv", "ipv4_address": "ipv4_address_serv", "network":"network_serv"})

In [481]:
# No role-based benign labelling for incomplete flows: every flow starts unlabelled.
tcp_nocomplete["label"] = ""

In [482]:
# Label every still-unlabelled flow that starts strictly inside an attack window.
# A regular phase is one window labelled with the phase name;
# "host_recon_commands" is a list of windows, each labelled with the command id.
for k, phase in foothold.items():
    # The global start/stop markers are not attack phases.
    if k in ("start", "stop"):
        continue

    is_command_list = k == "host_recon_commands"
    windows = phase if is_command_list else [phase]

    for window in windows:
        timestamp_start = datetime.strptime(window["start"], "%Y-%m-%dT%H:%M:%S.%fZ")
        timestamp_stop = datetime.strptime(window["stop"], "%Y-%m-%dT%H:%M:%S.%fZ")

        # Recomputed for every window: earlier windows may already have labelled rows.
        in_window = (
            (tcp_nocomplete["timestamp"] > timestamp_start)
            & (tcp_nocomplete["timestamp"] < timestamp_stop)
            & (tcp_nocomplete["label"] == "")
        )
        tcp_nocomplete.loc[in_window, "label"] = window["id"] if is_command_list else k
                

In [483]:
# Incomplete flows outside every attack window are treated as benign broken flows.
tcp_nocomplete["label"] = tcp_nocomplete["label"].where(tcp_nocomplete["label"] != "", "broken flow - benign")

In [484]:
# Number of incomplete flows per label.
tcp_nocomplete.label.value_counts()

host_discover_local      131064
service_scan              22737
dns_brute_force_start       813
broken flow - benign        256
host_discover_dmz             7
Name: label, dtype: int64

### Label UDP
All UDP flows (defined by standard timeout on Tstat)

In [485]:
def label_udp(x):
    """Return the benign label of one UDP flow, or "" if no rule matches.

    Labelling relies on ports and addresses, which is useful for
    configuration protocols on standard ports and for QUIC browsing.
    Rules are evaluated in order and the first match wins.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``s_port:11``, ``c_port:2``, ``s_ip:10``, ``role_cli``
        and ``role_serv``.

    Returns
    -------
    str
        The matching label, or the empty string.
    """
    server_port = x["s_port:11"]

    if server_port == 123:
        return "NTP"
    if server_port in (67, 68):
        return "bootp"  # DHCP

    client_port = x["c_port:2"]
    if server_port == 1194 or client_port == 1194:
        return "OpenVPN"

    stun_ports = (3478, 19302)
    if server_port in stun_ports or client_port in stun_ports:
        return "STUN VoIP"
    if server_port in (137, 138):
        return "NetBIOS"

    server_role = x["role_serv"]
    if server_port == 443 and server_role == "external":
        return "QUIC"

    server_ip = x["s_ip:10"]
    if server_ip == "224.0.0.251" and server_port == 5353:
        return "mDNS"
    if server_ip == "ff02::00fb" and server_port == 5353:
        return "mDNSv6"
    if server_ip == "8.8.8.8":
        return "GoogleDNS"
    if server_ip == "127.0.0.53":
        return "systemd-resolver (local)"

    # DNS is benign only when the attacker is on neither side ...
    client_role = x["role_cli"]
    uses_dns_port = server_port == 53 or client_port == 53
    if client_role != "attacker_0" and server_role != "attacker_0" and uses_dns_port:
        return "benign DNS"
    # ... or when the attacker itself is the client of a DNS query.
    if client_role == "attacker_0" and server_port == 53:
        return "benign DNS - attacker requesting updates"

    return ""

In [486]:
# Collect every tstat log_udp_complete file of the dataset.
# NOTE: recursive=True only affects "**" patterns, so it has no effect here.
udp_filelist = glob("./{}_logs/log*/*.out/log_udp_complete".format(company_name), recursive=True)

In [487]:
# Read each space-separated log and stack them into a single frame.
udp = pd.concat([pd.read_csv(filename, sep=" ") for filename in udp_filelist])
# Display (rows, columns) of the raw data.
udp.shape

(112776, 19)

In [488]:
# Drop every flow whose client or server address equals the filtered IP.
uses_filtered_ip = (udp["#c_ip:1"] == ip_to_filter) | (udp["s_ip:10"] == ip_to_filter)
udp = udp[~uses_filtered_ip]
udp.shape

(112608, 19)

In [489]:
# Convert the client's first-packet time ("c_first_abs:3", interpreted as epoch
# milliseconds) to a datetime and keep only flows strictly inside the
# simulation window.
udp["timestamp"] = pd.to_datetime(udp["c_first_abs:3"], unit='ms')
started_after = udp["timestamp"] > start_date
started_before = udp["timestamp"] < end_date
udp = udp[started_after & started_before]
udp.shape

(99511, 20)

In [490]:
# Flow start time rounded to the nearest second, kept as an extra column.
udp["rounded_timestamp"] = udp["timestamp"].dt.round("1s")

In [491]:
# Attach role / address / network first for the client, then for the server.
# Endpoints whose IP is not listed in df_roles are tagged "external". Only the
# three joined columns are filled: a frame-wide fillna would also overwrite
# genuinely missing tstat values with the string "external".
role_fill = {"role": "external", "ipv4_address": "external", "network": "external"}

udp = udp.merge(df_roles, right_on="ipv4_address", left_on="#c_ip:1", how="left").fillna(role_fill)
udp = udp.rename(columns={"role":"role_cli", "ipv4_address": "ipv4_address_cli", "network":"network_cli"})

udp = udp.merge(df_roles, right_on="ipv4_address", left_on="s_ip:10", how="left").fillna(role_fill)
udp = udp.rename(columns={"role":"role_serv", "ipv4_address": "ipv4_address_serv", "network":"network_serv"})

In [492]:
# The row count should be unchanged by the two left merges.
udp.shape

(99511, 27)

In [493]:
# Flag flows where the attacker is both client and server.
udp["check_loops"] = (udp["role_cli"] == "attacker_0") & (udp["role_serv"] == "attacker_0")

In [494]:
# Keep only the flows that are not attacker-to-attacker.
udp = udp.loc[udp["check_loops"].eq(False)]

In [495]:
# The helper flag is no longer needed.
udp = udp.drop(columns="check_loops")

In [496]:
# Row-wise benign labelling; flows matching no rule get "".
udp["label"] = udp.apply(label_udp, axis=1)

In [497]:
# Inspect the flows left unlabelled, grouped by endpoints and server port.
udp[udp["label"] == ""].groupby(["#c_ip:1", "role_cli", "s_ip:10", "role_serv","s_port:11", "label"]).size()

#c_ip:1          role_cli  s_ip:10          role_serv   s_port:11  label
192.168.231.254  inet-dns  192.168.230.122  attacker_0  53                  15778
dtype: int64

In [498]:
# Every flow that is still unlabelled is marked as data exfiltration.
udp["label"] = udp["label"].where(udp["label"] != "", "data exfiltration")

In [499]:
# Number of UDP flows per label.
udp["label"].value_counts()

benign DNS                                  69211
data exfiltration                           15778
NTP                                          6213
NetBIOS                                      3508
mDNS                                         1577
mDNSv6                                       1111
QUIC                                          850
bootp                                         717
systemd-resolver (local)                      344
OpenVPN                                       137
benign DNS - attacker requesting updates       44
GoogleDNS                                       5
Name: label, dtype: int64

### Save files

In [501]:
# Write the three labelled frames without the index column.
# The labelled_logs directory must already exist.
labelled_outputs = (
    (tcp_complete, "./{}/labelled_logs/tcp_complete.csv"),
    (tcp_nocomplete, "./{}/labelled_logs/tcp_nocomplete.csv"),
    (udp, "./{}/labelled_logs/udp_complete.csv"),
)
for labelled_frame, path_template in labelled_outputs:
    labelled_frame.to_csv(path_template.format(company_name), index=False)