#!/usr/bin/env python
# coding: utf-8
"""Frequent-pattern mining of 2020 accident records with PrefixSpan.

This first part of the script connects to Elasticsearch, downloads two
pages of accident documents, keeps the columns used for the analysis and
runs PrefixSpan over the NTM rows (``ACCD_FIND_MTD_CODE == '1'``).
"""

# In[1]:

import argparse
import array
import math
import os
import pickle
# import joblib
import ssl
import sys
from datetime import datetime
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import urllib3
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.connection import create_ssl_context

# In[3]:

from mlxtend.frequent_patterns import association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from prefixspan import PrefixSpan


# In[4]:

# Certificate and host-name verification are switched off on purpose here.
ssl_context = create_ssl_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE


# In[12]:

# NOTE(security): the host and the credentials are hard-coded in the source.
# They are left untouched so the script keeps working, but they should be
# moved to configuration / environment variables and rotated.
es = Elasticsearch(
    hosts=[{'host': '223.194.92.152', 'port': 9200}],
    scheme="http",
    verify_certs=False,
    timeout=300,
    ssl_context=ssl_context,
    http_auth=("elasticsearch", "hadoop2019@!@#$"),
)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


# In[347]:

######## 2020, 1 year ########
######## There are no MTM data in 2018, 2019 ########
ACCIDENT_INDEX = 'ts_data_accident-2020'


def _search_accidents(search_after=None):
    """Request one page (up to 10000 hits) of the 2020 accident query.

    Args:
        search_after: Sort value of the last hit of the previous page, or
            ``None`` to request the first page.

    Returns:
        The raw response of ``es.search``.
    """
    body = {
        "size": 10000,
        "query": {
            "range": {
                "TW_COLLECT_DT": {
                    "gte": "2020-01-01T00:00:00.625+09:00",
                    # NOTE(review): this upper bound is just after midnight
                    # on 2020-12-31, so later documents of that day are not
                    # matched - confirm this is intended.
                    "lte": "2020-12-31T00:00:00.625+09:00",
                }
            }
        },
        "sort": [{"_id": "asc"}],
    }
    if search_after is not None:
        body["search_after"] = [search_after]
    return es.search(index=ACCIDENT_INDEX, body=body)


# First page.
res = _search_accidents()
data = res['hits']['hits']
# The cursor lives under res['hits']['hits'] (the original looked up the
# non-existent key "hit"); an empty page has no cursor.
nxt = data[-1]["sort"][0] if data else None
total = res['hits']['total']
# print(total)
accident = [da['_source'] for da in data]
df_10000 = pd.DataFrame(accident)
print(df_10000.head())


# In[ ]:

######## 2020, 1 year ########
######## There are no MTM data in 2018, 2019 ########
# Second page, continuing after the last hit of the first page.
if nxt is not None:
    res = _search_accidents(search_after=nxt)
    data = res['hits']['hits']
    nxt = data[-1]["sort"][0] if data else None
    total = res['hits']['total']
    accident = [da['_source'] for da in data]
else:
    accident = []
df_20000 = pd.DataFrame(accident)
print(df_20000.head())


# In[348]:

# ``df`` was used below without ever being assigned; build it from the two
# downloaded pages.
df = pd.concat([df_10000, df_20000], ignore_index=True)

df = df[['RISK_V2', 'INST_NM', 'DRULE_ATT_TYPE_CODE1', 'TW_ATT_IP',
         'TW_ATT_PORT', 'TW_DMG_IP', 'TW_DMG_PORT', 'ACCD_DMG_PROTO_NM',
         'TW_ATT_CT_NM', 'ACCD_FIND_MTD_CODE', 'DRULE_NM']].dropna()
print(df.head())


# In[349]:

##################### NTM section #####################


# In[350]:

# Copy the selection so the columns added below do not write into a slice
# of ``df``.
NTM_df = df[df['ACCD_FIND_MTD_CODE'] == '1'].copy()
print(len(NTM_df))


# In[351]:

# Pick RISK_V2 out in order to extract the asset, intent and black-IP flags
RISK_V2 = NTM_df['RISK_V2']
RISK_V2_FILTERED = RISK_V2.dropna()
print(RISK_V2.size)
print(RISK_V2_FILTERED.size)


# In[353]:

# modified
_ASSET_DESCRIPTIONS = {
    'ASSETS_VAL_1': '공인-전체IP대역(유선)',
    'ASSETS_VAL_2': '공인-전체IP대역(무선)',
    'ASSETS_VAL_3': '공인-WEB서버',
    'ASSETS_VAL_4': '공인-내부응용서버',
    'ASSETS_VAL_5': '공인-DB서버',
    'ASSETS_VAL_6': '공인-패치서버',
    'ASSETS_VAL_7': '공인-네트워크',
    'ASSETS_VAL_8': '공인-보안',
    'ASSETS_VAL_9': '공인-업무용PC',
    'ASSETS_VAL_10': '공인-비업무용PC',
    'ASSETS_VAL_11': '공인-기타',
    'ASSETS_VAL_12': '사설-전체IP대역(유선)',
    'ASSETS_VAL_13': '사설-전체IP대역(무선)',
    'ASSETS_VAL_14': '사설-WEB서버',
    'ASSETS_VAL_15': '사설-내부응용서버',
    'ASSETS_VAL_16': '사설-DB서버',
    'ASSETS_VAL_17': '사설-패치서버',
    'ASSETS_VAL_18': '사설-네트워크',
    'ASSETS_VAL_19': '사설-보안',
    'ASSETS_VAL_20': '사설-업무용PC',
    'ASSETS_VAL_21': '사설-비업무용PC',
    'ASSETS_VAL_22': '사설-기타',
}


def get_asset_desc(asset_field):
    """Return the description of an ``ASSETS_VAL_n`` key ('' if unknown)."""
    return _ASSET_DESCRIPTIONS.get(asset_field, '')


# In[352]:

def filter_assets_value(risk):
    """List the truthy ``ASSETS_VAL_*`` entries of one RISK_V2 record.

    Args:
        risk: Mapping of RISK_V2 keys to values.

    Returns:
        A list of ``'RISK_V2.<key>=<description>'`` strings.  When ``risk``
        cannot be walked like a mapping, the offending value and its type
        are printed and the entries collected so far are returned.
    """
    risks = []
    try:
        for risk_key in risk:
            if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
                risks.append(
                    'RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key))
    except (TypeError, KeyError, IndexError):
        # Best effort: report the malformed record and keep what we have.
        print(risk)
        print(type(risk))
    return risks


# In[354]:

# New assets column (stored as the string form of the list)
NTM_df['ASSETS_VAL'] = [
    str(filter_assets_value(risk)) for risk in RISK_V2_FILTERED]
print(NTM_df[:1])


# In[356]:

_INTENT_DESCRIPTIONS = {
    'INTENT_VAL_1': '파괴',
    'INTENT_VAL_2': '유출',
    'INTENT_VAL_3': '지연',
    'INTENT_VAL_4': '잠복',
    'INTENT_VAL_5': '단순침입',
    'INTENT_VAL_6': 'MD5',
    'INTENT_VAL_0': 'Default',
}


def get_intent_desc(intent_field):
    """Return the description of an ``INTENT_VAL_n`` key ('' if unknown)."""
    return _INTENT_DESCRIPTIONS.get(intent_field, '')


# In[355]:

# modified
def filter_intent(intent):
    """List the truthy ``INTENT_VAL_*`` entries of one RISK_V2 record.

    Args:
        intent: Mapping of RISK_V2 keys to values.

    Returns:
        A list of ``'RISK_V2.<key>=<description>'`` strings.
    """
    return [
        'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
        for intent_key in intent
        if 'INTENT_VAL_' in intent_key and intent[intent_key]
    ]


# In[357]:

# New column of intent value
NTM_df['INTENT_VAL'] = [str(filter_intent(risk)) for risk in RISK_V2_FILTERED]
print(NTM_df[:1])


# In[359]:

def get_source_desc(source_field):
    """Return the description of a ``SOURCE_VAL_n`` key ('' if unknown)."""
    if source_field == 'SOURCE_VAL_1':
        return '북한IP'
    if source_field == 'SOURCE_VAL_3':
        return 'ECSC Black IP'
    return ''


# In[358]:

# modified
def filter_source(source):
    """List the truthy ``SOURCE_VAL_*`` entries of one RISK_V2 record.

    Args:
        source: Mapping of RISK_V2 keys to values.

    Returns:
        A list of ``'RISK_V2.<key>=<description>'`` strings.
    """
    return [
        'RISK_V2.' + source_key + '=' + get_source_desc(source_key)
        for source_key in source
        if 'SOURCE_VAL_' in source_key and source[source_key]
    ]


# In[360]:

# New column of SOURCE_VAL value
NTM_df['SOURCE_VAL'] = [str(filter_source(risk)) for risk in RISK_V2_FILTERED]
print(NTM_df[:5])


# In[361]:

NTM_df = NTM_df.drop(columns=['RISK_V2'])
print(NTM_df.columns)


# In[362]:

##################### You can continue from here. #####################
##################### Apply the algorithm to every combination of the 12 items below (excluding 12. device ACCD_FIND_MTD_CODE) #####################

# It should be 13 columns in total

# 1. Institution               INST_NM
# 2. Attack                    DRULE_ATT_TYPE_CODE1
# 3. Asset                     ASSETS_VAL
# 4. Threat attacker IP        TW_ATT_IP
# 5. Threat attacker port      TW_ATT_PORT
# 6. Threat victim IP          TW_DMG_IP
# 7. Threat victim port        TW_DMG_PORT
# 8. Threat victim protocol    ACCD_DMG_PROTO_NM
# 9. Attacker country          TW_ATT_CT_NM
# 10. Intent (7 kinds)         INTENT_VAL
# 11. IP/URL weight            SOURCE_VAL
# 12. Device                   ACCD_FIND_MTD_CODE
# 13. Detection rule name      DRULE_NM
#


# In[363]:

print(NTM_df.isna().sum())


# In[364]:

# Change the NaN to a neutral value ('' or 0, per column)
_NTM_NAN_DEFAULTS = {
    'ACCD_DMG_PROTO_NM': '',
    'INST_NM': '',
    'DRULE_ATT_TYPE_CODE1': '',
    'TW_ATT_IP': 0,
    'TW_ATT_PORT': 0,
    'TW_DMG_IP': 0,
    'TW_DMG_PORT': 0,
    'TW_ATT_CT_NM': '',
    'ASSETS_VAL': 0,
    'INTENT_VAL': 0,
    'SOURCE_VAL': 0,
    'DRULE_NM': '',
}
for _ntm_column, _ntm_default in _NTM_NAN_DEFAULTS.items():
    NTM_df[_ntm_column] = NTM_df[_ntm_column].replace(np.nan, _ntm_default)


# In[365]:

# Check NaN out again
print(NTM_df.isna().sum())


# In[366]:

# # Merge all
# # Make one space-separated string from all of the elements
_NTM_COMBINED_COLUMNS = [
    'INST_NM', 'TW_ATT_IP', 'TW_ATT_PORT', 'TW_DMG_IP', 'TW_DMG_PORT',
    'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM', 'ASSETS_VAL', 'INTENT_VAL',
    'SOURCE_VAL', 'DRULE_ATT_TYPE_CODE1', 'DRULE_NM',
]
# Every operand is cast to str so a non-string column cannot break the
# concatenation (the original only cast the first six).
_ntm_text = NTM_df[_NTM_COMBINED_COLUMNS].astype(str)
_ntm_combined = _ntm_text[_NTM_COMBINED_COLUMNS[0]]
for _ntm_column in _NTM_COMBINED_COLUMNS[1:]:
    _ntm_combined = _ntm_combined + ' ' + _ntm_text[_ntm_column]
NTM_df['Combined'] = _ntm_combined

NTM_com = NTM_df['Combined']
print(NTM_com[:10])


# In[367]:

# Change the type to DataFrame
NTM_to_df = pd.DataFrame(NTM_com)
print(NTM_to_df[:5])


# In[368]:

# Change the type to list in order to apply the algorithm (nested list);
# each inner list holds exactly one item, the combined string of the row.
NTM_tolist = NTM_to_df.values.tolist()
print(NTM_tolist[:5])


# In[369]:

from prefixspan import PrefixSpan


# In[370]:

# Apply prefixspan
PrefixSpan_NTM = PrefixSpan(NTM_tolist)

###### Interchangeable ######
# Get any pattern whose frequency is at least 1
prefix_NTM = PrefixSpan_NTM.frequent(1)
print(prefix_NTM[:3])


# In[371]:

# Put the result to DataFrame
prefix_NTM_df = pd.DataFrame(prefix_NTM)
print(prefix_NTM_df[:5])


# In[372]:

# Change the column names
# PrefixSpan.frequent() rows arrive as columns 0 and 1; they are named
# Frequency and Cause here.
prefix_NTM_df.rename(columns={0: 'Frequency', 1: 'Cause'}, inplace=True)

# Make the new column for filling the Effect
prefix_NTM_df['Effect'] = np.nan

# Change the order of columns (copy so later assignments do not target a
# slice of the previous frame)
prefix_NTM_df = prefix_NTM_df[['Cause', 'Effect', 'Frequency']].copy()
print(prefix_NTM_df[:2])


# In[373]:

# Define the function that finds the rule name
def generate_cause(cell):
    """Return the first rule keyword found in the first item of ``cell``.

    Args:
        cell: A Cause pattern; only ``cell[0]`` is inspected.

    Returns:
        The first of 'Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB' that
        occurs in ``cell[0]`` preceded by a space, or '' when none does.
    """
    drules = ['Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB']
    for drule in drules:
        if ' ' + drule in cell[0]:
            return drule
    return ''


# Map every Cause to its rule name; that rule name is the Effect
effect = list(map(generate_cause, prefix_NTM_df.Cause))

# Assign the rule name as an effect
prefix_NTM_df['Effect'] = effect
print(prefix_NTM_df.sort_values(by=['Frequency'], ascending=False))


# In[374]:

def _cause_mentions(ps, keyword):
    """Tell whether the first item of pattern ``ps`` contains ' <keyword>'."""
    return ' ' + keyword in ps[0]


def _select_causes(frame, predicate, label):
    """Return the rows of ``frame`` whose Cause satisfies ``predicate``.

    Missing values of the selection are filled with ``label``.  The mask is
    an index-aligned boolean Series so that an empty frame yields an empty
    selection that still has its columns.
    """
    mask = pd.Series(
        [predicate(cause) for cause in frame.Cause],
        index=frame.index, dtype=bool)
    return frame[mask].fillna(label)


# Attack Filter
def Attack_filter(ps):
    """Tell whether the first item of ``ps`` contains ' Attack'."""
    return _cause_mentions(ps, 'Attack')


att_filter = _select_causes(prefix_NTM_df, Attack_filter, 'Attack')


# Malwr Filter
def Malwr_filter(ps):
    """Tell whether the first item of ``ps`` contains ' Malwr'."""
    return _cause_mentions(ps, 'Malwr')


mal_filter = _select_causes(prefix_NTM_df, Malwr_filter, 'Malwr')


# DDOS Filter
def DDOS_filter(ps):
    """Tell whether the first item of ``ps`` contains ' DDOS'."""
    return _cause_mentions(ps, 'DDOS')


dd_filter = _select_causes(prefix_NTM_df, DDOS_filter, 'DDOS')


# HACK Filter
def HACK_filter(ps):
    """Tell whether the first item of ``ps`` contains ' HACK'."""
    return _cause_mentions(ps, 'HACK')


hack_filter = _select_causes(prefix_NTM_df, HACK_filter, 'HACK')


# MAIL Filter
def MAIL_filter(ps):
    """Tell whether the first item of ``ps`` contains ' MAIL'."""
    return _cause_mentions(ps, 'MAIL')


mail_filter = _select_causes(prefix_NTM_df, MAIL_filter, 'MAIL')


# WEB Filter
def WEB_filter(ps):
    """Tell whether the first item of ``ps`` contains ' WEB'."""
    return _cause_mentions(ps, 'WEB')


web_filter = _select_causes(prefix_NTM_df, WEB_filter, 'WEB')

# A Cause that mentions several keywords appears once per matching filter.
frames = [att_filter, mal_filter, dd_filter, hack_filter, mail_filter,
          web_filter]
result = pd.concat(frames)
print(result.sort_values(by=['Frequency'], ascending=False))


# In[ ]:

##################### NTM section End #####################


# In[ ]:

##################### MTM section #####################


# In[375]:

# Copy the selection so the columns added below do not write into a slice
# of ``df``.
MTM_df = df[df['ACCD_FIND_MTD_CODE'] == '2'].copy()
print(len(MTM_df))


# In[376]:

# Pick RISK_V2 out in order to extract the asset, intent and black-IP flags
RISK_V2_MTM = MTM_df['RISK_V2']
RISK_V2_FILTERED_MTM = RISK_V2_MTM.dropna()
print(RISK_V2_MTM.size)
print(RISK_V2_FILTERED_MTM.size)


# In[378]:

# modified
_ASSET_DESCRIPTIONS_MTM = {
    'ASSETS_VAL_1': '공인-전체IP대역(유선)',
    'ASSETS_VAL_2': '공인-전체IP대역(무선)',
    'ASSETS_VAL_3': '공인-WEB서버',
    'ASSETS_VAL_4': '공인-내부응용서버',
    'ASSETS_VAL_5': '공인-DB서버',
    'ASSETS_VAL_6': '공인-패치서버',
    'ASSETS_VAL_7': '공인-네트워크',
    'ASSETS_VAL_8': '공인-보안',
    'ASSETS_VAL_9': '공인-업무용PC',
    'ASSETS_VAL_10': '공인-비업무용PC',
    'ASSETS_VAL_11': '공인-기타',
    'ASSETS_VAL_12': '사설-전체IP대역(유선)',
    'ASSETS_VAL_13': '사설-전체IP대역(무선)',
    'ASSETS_VAL_14': '사설-WEB서버',
    'ASSETS_VAL_15': '사설-내부응용서버',
    'ASSETS_VAL_16': '사설-DB서버',
    'ASSETS_VAL_17': '사설-패치서버',
    'ASSETS_VAL_18': '사설-네트워크',
    'ASSETS_VAL_19': '사설-보안',
    'ASSETS_VAL_20': '사설-업무용PC',
    'ASSETS_VAL_21': '사설-비업무용PC',
    'ASSETS_VAL_22': '사설-기타',
}


def get_asset_desc_MTM(asset_field):
    """Return the description of an ``ASSETS_VAL_n`` key ('' if unknown)."""
    return _ASSET_DESCRIPTIONS_MTM.get(asset_field, '')


# In[377]:

def filter_assets_value_MTM(risk):
    """List the truthy ``ASSETS_VAL_*`` entries of one RISK_V2 record.

    Args:
        risk: Mapping of RISK_V2 keys to values.

    Returns:
        A list of ``'RISK_V2.<key>=<description>'`` strings.  When ``risk``
        cannot be walked like a mapping, the offending value and its type
        are printed and the entries collected so far are returned.
    """
    risks = []
    try:
        for risk_key in risk:
            if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
                # Uses the MTM description helper (the original called the
                # NTM one, which holds the same table).
                risks.append(
                    'RISK_V2.' + risk_key + '='
                    + get_asset_desc_MTM(risk_key))
    except (TypeError, KeyError, IndexError):
        # Best effort: report the malformed record and keep what we have.
        print(risk)
        print(type(risk))
    return risks


# In[379]:

# New assets column (stored as the string form of the list)
MTM_df['ASSETS_VAL'] = [
    str(filter_assets_value_MTM(risk)) for risk in RISK_V2_FILTERED_MTM]
print(MTM_df[:1])


# In[382]:

_INTENT_DESCRIPTIONS_MTM = {
    'INTENT_VAL_1': '파괴',
    'INTENT_VAL_2': '유출',
    'INTENT_VAL_3': '지연',
    'INTENT_VAL_4': '잠복',
    'INTENT_VAL_5': '단순침입',
    'INTENT_VAL_6': 'MD5',
    'INTENT_VAL_0': 'Default',
}


def get_intent_desc_MTM(intent_field):
    """Return the description of an ``INTENT_VAL_n`` key ('' if unknown)."""
    return _INTENT_DESCRIPTIONS_MTM.get(intent_field, '')


# In[381]:

# modified
def filter_intent_MTM(intent):
    """List the truthy ``INTENT_VAL_*`` entries of one RISK_V2 record.

    Args:
        intent: Mapping of RISK_V2 keys to values.

    Returns:
        A list of ``'RISK_V2.<key>=<description>'`` strings.
    """
    return [
        'RISK_V2.' + intent_key + '=' + get_intent_desc_MTM(intent_key)
        for intent_key in intent
        if 'INTENT_VAL_' in intent_key and intent[intent_key]
    ]


# In[383]:

# New column of intent value
MTM_df['INTENT_VAL'] = [
    str(filter_intent_MTM(risk)) for risk in RISK_V2_FILTERED_MTM]
print(MTM_df[:1])


# In[385]:

def get_source_desc_MTM(source_field):
    """Return the description of a ``SOURCE_VAL_n`` key ('' if unknown)."""
    if source_field == 'SOURCE_VAL_1':
        return '북한IP'
    if source_field == 'SOURCE_VAL_3':
        return 'ECSC Black IP'
    return ''


# In[384]:

# modified
def filter_source_MTM(source):
    """List the truthy ``SOURCE_VAL_*`` entries of one RISK_V2 record.

    Args:
        source: Mapping of RISK_V2 keys to values.

    Returns:
        A list of ``'RISK_V2.<key>=<description>'`` strings.
    """
    return [
        'RISK_V2.' + source_key + '=' + get_source_desc_MTM(source_key)
        for source_key in source
        if 'SOURCE_VAL_' in source_key and source[source_key]
    ]


# In[386]:

# New column of SOURCE_VAL value
MTM_df['SOURCE_VAL'] = [
    str(filter_source_MTM(risk)) for risk in RISK_V2_FILTERED_MTM]
print(MTM_df[:5])


# In[387]:

MTM_df = MTM_df.drop(columns=['RISK_V2'])
print(MTM_df.columns)


# In[388]:

print(MTM_df.isna().sum())


# In[389]:

# Change the NaN to a neutral value ('' or 0, per column)
_MTM_NAN_DEFAULTS_HEAD = {
    'ACCD_DMG_PROTO_NM': '',
    'INST_NM': '',
    'DRULE_ATT_TYPE_CODE1': '',
    'TW_ATT_IP': 0,
    'TW_ATT_PORT': 0,
    'TW_DMG_IP': 0,
    'TW_DMG_PORT': 0,
}
for _mtm_head_column, _mtm_head_default in _MTM_NAN_DEFAULTS_HEAD.items():
    MTM_df[_mtm_head_column] = MTM_df[_mtm_head_column].replace(
        np.nan, _mtm_head_default)
# Remaining columns of the NaN clean-up ('' or 0, per column)
_MTM_NAN_DEFAULTS_TAIL = {
    'TW_ATT_CT_NM': '',
    'ASSETS_VAL': 0,
    'INTENT_VAL': 0,
    'SOURCE_VAL': 0,
    'DRULE_NM': '',
}
for _mtm_tail_column, _mtm_tail_default in _MTM_NAN_DEFAULTS_TAIL.items():
    MTM_df[_mtm_tail_column] = MTM_df[_mtm_tail_column].replace(
        np.nan, _mtm_tail_default)


# In[390]:

# Check NaN out again
print(MTM_df.isna().sum())


# In[391]:

# # Merge all
# # Make one space-separated string from all of the elements
_MTM_COMBINED_COLUMNS = [
    'INST_NM', 'TW_ATT_IP', 'TW_ATT_PORT', 'TW_DMG_IP', 'TW_DMG_PORT',
    'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM', 'ASSETS_VAL', 'INTENT_VAL',
    'SOURCE_VAL', 'DRULE_ATT_TYPE_CODE1', 'DRULE_NM',
]
# Every operand is cast to str so a non-string column cannot break the
# concatenation (the original only cast the first six).
_mtm_text = MTM_df[_MTM_COMBINED_COLUMNS].astype(str)
_mtm_combined = _mtm_text[_MTM_COMBINED_COLUMNS[0]]
for _mtm_column in _MTM_COMBINED_COLUMNS[1:]:
    _mtm_combined = _mtm_combined + ' ' + _mtm_text[_mtm_column]
MTM_df['Combined'] = _mtm_combined

MTM_com = MTM_df['Combined']
print(MTM_com[:10])


# In[392]:

# Change the type to DataFrame
MTM_to_df = pd.DataFrame(MTM_com)
print(MTM_to_df[:5])


# In[393]:

# Change the type to list in order to apply the algorithm (nested list);
# each inner list holds exactly one item, the combined string of the row.
MTM_tolist = MTM_to_df.values.tolist()
print(MTM_tolist[:5])


# In[394]:

# Apply prefixspan
PrefixSpan_MTM = PrefixSpan(MTM_tolist)

###### Interchangeable ######
# Get any pattern whose frequency is at least 1
prefix_MTM = PrefixSpan_MTM.frequent(1)
print(prefix_MTM[:3])


# In[395]:

# Put the result to DataFrame
prefix_MTM_df = pd.DataFrame(prefix_MTM)
print(prefix_MTM_df[:5])


# In[396]:

# Change the column names
prefix_MTM_df.rename(columns={0: 'Frequency', 1: 'Cause'}, inplace=True)

# Make the new column for filling the Effect
prefix_MTM_df['Effect'] = np.nan

# Change the order of columns (copy so later assignments do not target a
# slice of the previous frame)
prefix_MTM_df = prefix_MTM_df[['Cause', 'Effect', 'Frequency']].copy()
print(prefix_MTM_df[:2])


# In[397]:

# Define the function that finds the rule name
def generate_cause_MTM(cell):
    """Return the first rule keyword found in the first item of ``cell``.

    Args:
        cell: A Cause pattern; only ``cell[0]`` is inspected.

    Returns:
        The first of 'Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB' that
        occurs in ``cell[0]`` preceded by a space, or '' when none does.
    """
    drules = ['Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB']
    for drule in drules:
        if ' ' + drule in cell[0]:
            return drule
    return ''


# Map every Cause to its rule name; that rule name is the Effect.
# The MTM helper defined just above is used (the original called the NTM
# ``generate_cause`` here).
effect_MTM = list(map(generate_cause_MTM, prefix_MTM_df.Cause))

# Assign the rule name as an effect
prefix_MTM_df['Effect'] = effect_MTM
print(prefix_MTM_df.sort_values(by=['Frequency'], ascending=False))


# In[399]:

def _cause_mentions_MTM(ps, keyword):
    """Tell whether the first item of pattern ``ps`` contains ' <keyword>'."""
    return ' ' + keyword in ps[0]


def _select_causes_MTM(frame, predicate, label):
    """Return the rows of ``frame`` whose Cause satisfies ``predicate``.

    Missing values of the selection are filled with ``label``.  The mask is
    an index-aligned boolean Series so that an empty frame yields an empty
    selection that still has its columns.
    """
    mask = pd.Series(
        [predicate(cause) for cause in frame.Cause],
        index=frame.index, dtype=bool)
    return frame[mask].fillna(label)


# Attack Filter
def Attack_filter_MTM(ps):
    """Tell whether the first item of ``ps`` contains ' Attack'."""
    return _cause_mentions_MTM(ps, 'Attack')


att_filter_MTM = _select_causes_MTM(prefix_MTM_df, Attack_filter_MTM, 'Attack')


# Malwr Filter
def Malwr_filter_MTM(ps):
    """Tell whether the first item of ``ps`` contains ' Malwr'."""
    return _cause_mentions_MTM(ps, 'Malwr')


mal_filter_MTM = _select_causes_MTM(prefix_MTM_df, Malwr_filter_MTM, 'Malwr')


# DDOS Filter
def DDOS_filter_MTM(ps):
    """Tell whether the first item of ``ps`` contains ' DDOS'."""
    return _cause_mentions_MTM(ps, 'DDOS')


dd_filter_MTM = _select_causes_MTM(prefix_MTM_df, DDOS_filter_MTM, 'DDOS')


# HACK Filter
def HACK_filter_MTM(ps):
    """Tell whether the first item of ``ps`` contains ' HACK'."""
    return _cause_mentions_MTM(ps, 'HACK')


hack_filter_MTM = _select_causes_MTM(prefix_MTM_df, HACK_filter_MTM, 'HACK')


# MAIL Filter
def MAIL_filter_MTM(ps):
    """Tell whether the first item of ``ps`` contains ' MAIL'."""
    return _cause_mentions_MTM(ps, 'MAIL')


mail_filter_MTM = _select_causes_MTM(prefix_MTM_df, MAIL_filter_MTM, 'MAIL')


# WEB Filter
def WEB_filter_MTM(ps):
    """Tell whether the first item of ``ps`` contains ' WEB'."""
    return _cause_mentions_MTM(ps, 'WEB')


web_filter_MTM = _select_causes_MTM(prefix_MTM_df, WEB_filter_MTM, 'WEB')

# A Cause that mentions several keywords appears once per matching filter.
frames_MTM = [att_filter_MTM, mal_filter_MTM, dd_filter_MTM, hack_filter_MTM,
              mail_filter_MTM, web_filter_MTM]
result_MTM = pd.concat(frames_MTM)
print(result_MTM.sort_values(by=['Frequency'], ascending=False))


# In[ ]: