#!/usr/bin/env python
# coding: utf-8

# NTM (유해트래픽 탐지장비): harmful-traffic detection equipment

#

# MTM (악성파일 탐지장비): malicious-file detection equipment

# In[1]:
import json

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from pandas import json_normalize
from prefixspan import PrefixSpan

# Load ts_data_accident-2020_sample.csv.
# low_memory=False prevents the mixed-type DtypeWarning.
df = pd.read_csv('ts_data_accident-2020_sample.csv', low_memory=False)
df = df[['RISK_V2', 'INST_NM', 'DRULE_ATT_TYPE_CODE1', 'TW_ATT_IP', 'TW_ATT_PORT',
         'TW_DMG_IP', 'TW_DMG_PORT', 'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM',
         'ACCD_FIND_MTD_CODE', 'DRULE_NM']].dropna()
len(df)  # len(df) : 10000, load successful
df.head()


# In[2]:
##################### NTM section #####################
# .copy() detaches the subset from df, so the column assignments and the
# in-place drop below modify NTM_df itself rather than a temporary slice.
NTM_df = df[df['ACCD_FIND_MTD_CODE'] == 1].copy()  # edited: compare with 1, not '1'
len(NTM_df)
# NTM_df.head()


# In[3]:
# Pick RISK_V2 out in order to extract the asset, intent and black-IP flags.
RISK_V2 = NTM_df['RISK_V2']
RISK_V2_FILTERED = RISK_V2.dropna()
print(RISK_V2.size)
print(RISK_V2_FILTERED.size)

# Added: the earlier filter_assets_value could not recognise the values, so
# RISK_V2 is expanded into a separate DataFrame (one row per record, one
# column per flattened key).  Each cell is a single-quoted dict string that is
# turned into JSON before decoding.
# The per-record frames are concatenated once instead of inside the loop,
# which avoided nothing but made the build quadratic.
_risk_frames = [
    json_normalize(json.loads(raw.replace("'", '"')))
    for raw in RISK_V2_FILTERED
]
risk_df = pd.concat(_risk_frames, ignore_index=True) if _risk_frames else pd.DataFrame()
risk_df_col = risk_df.columns.values.tolist()


# In[4]:
# In[352]:
# Per-record result lists filled by filter_assets_value(); index i of each
# list corresponds to row i of the frame passed to that function.
asset_val = []
intent_val = []
source_val = []

# Description lookup tables (field name -> label written into the item text).
_ASSET_DESC = {
    'ASSETS_VAL_1': '공인-전체IP대역(유선)',
    'ASSETS_VAL_2': '공인-전체IP대역(무선)',
    'ASSETS_VAL_3': '공인-WEB서버',
    'ASSETS_VAL_4': '공인-내부응용서버',
    'ASSETS_VAL_5': '공인-DB서버',
    'ASSETS_VAL_6': '공인-패치서버',
    'ASSETS_VAL_7': '공인-네트워크',
    'ASSETS_VAL_8': '공인-보안',
    'ASSETS_VAL_9': '공인-업무용PC',
    'ASSETS_VAL_10': '공인-비업무용PC',
    'ASSETS_VAL_11': '공인-기타',
    'ASSETS_VAL_12': '사설-전체IP대역(유선)',
    'ASSETS_VAL_13': '사설-전체IP대역(무선)',
    'ASSETS_VAL_14': '사설-WEB서버',
    'ASSETS_VAL_15': '사설-내부응용서버',
    'ASSETS_VAL_16': '사설-DB서버',
    'ASSETS_VAL_17': '사설-패치서버',
    'ASSETS_VAL_18': '사설-네트워크',
    'ASSETS_VAL_19': '사설-보안',
    'ASSETS_VAL_20': '사설-업무용PC',
    'ASSETS_VAL_21': '사설-비업무용PC',
    'ASSETS_VAL_22': '사설-기타',
}
_INTENT_DESC = {
    'INTENT_VAL_1': '파괴',
    'INTENT_VAL_2': '유출',
    'INTENT_VAL_3': '지연',
    'INTENT_VAL_4': '잠복',
    'INTENT_VAL_5': '단순침입',
    'INTENT_VAL_6': 'MD5',
    'INTENT_VAL_0': 'Default',
}
_SOURCE_DESC = {
    'SOURCE_VAL_1': '북한IP',
    'SOURCE_VAL_3': 'ECSC Black IP',
}


def _is_set(value):
    """Return True when a flattened RISK_V2 cell is present and truthy.

    NaN marks a key that was absent from the record (it appears when frames
    with different columns are concatenated).  NaN is truthy in Python, so it
    has to be rejected explicitly.
    """
    if value is None:
        return False
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return False
    return bool(value)


def filter_assets_value(risk):
    """Collect the asset, intent and source items of every row of *risk*.

    For each row, every column listed in the module-level ``risk_df_col``
    whose name contains ``ASSETS_VAL_``, ``INTENT_VAL_`` or ``SOURCE_VAL_``
    and whose cell is set contributes one ``'RISK_V2.<key>=<description>'``
    string.  One list per row is appended to the module-level ``asset_val``,
    ``intent_val`` and ``source_val`` lists.

    Args:
        risk: DataFrame of flattened RISK_V2 records (``risk_df``).

    Returns:
        None.  Results are appended to the three module-level lists.
    """
    for record in risk.to_dict('records'):
        risks = []
        intents = []
        sources = []
        try:
            for key in risk_df_col:
                if 'ASSETS_VAL_' in key and _is_set(record[key]):
                    risks.append('RISK_V2.' + key + '=' + get_asset_desc(key))
                if 'INTENT_VAL_' in key and _is_set(record[key]):
                    intents.append('RISK_V2.' + key + '=' + get_intent_desc(key))
                if 'SOURCE_VAL_' in key and _is_set(record[key]):
                    sources.append('RISK_V2.' + key + '=' + get_source_desc(key))
        except (KeyError, TypeError, ValueError) as exc:
            # Best effort: keep what was collected so the lists stay aligned
            # with the rows, and report the offending record.
            print('filter_assets_value: incomplete record:', repr(exc), record)
        asset_val.append(risks)
        intent_val.append(intents)
        source_val.append(sources)


# modified
def get_asset_desc(asset_field):
    """Return the label for an ``ASSETS_VAL_n`` field, or '' if unknown."""
    return _ASSET_DESC.get(asset_field, '')


# modified
def filter_intent(intent):
    """Return the intent items of one RISK_V2 mapping.

    Args:
        intent: mapping of RISK_V2 field name to value.

    Returns:
        List of ``'RISK_V2.<key>=<description>'`` strings for every truthy
        key containing ``INTENT_VAL_``.
    """
    return [
        'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
        for intent_key in intent
        if 'INTENT_VAL_' in intent_key and intent[intent_key]
    ]


# In[356]:
def get_intent_desc(intent_field):
    """Return the label for an ``INTENT_VAL_n`` field, or '' if unknown."""
    return _INTENT_DESC.get(intent_field, '')


# In[358]:
# modified
def filter_source(source):
    """Return the source items of one RISK_V2 mapping.

    Args:
        source: mapping of RISK_V2 field name to value.

    Returns:
        List of ``'RISK_V2.<key>=<description>'`` strings for every truthy
        key containing ``SOURCE_VAL_``.
    """
    return [
        'RISK_V2.' + source_key + '=' + get_source_desc(source_key)
        for source_key in source
        if 'SOURCE_VAL_' in source_key and source[source_key]
    ]


# In[359]:
def get_source_desc(source_field):
    """Return the label for a ``SOURCE_VAL_n`` field, or '' if unknown."""
    return _SOURCE_DESC.get(source_field, '')


def _strip_brackets(items):
    """Render a list as text without the surrounding '[' and ']'."""
    return str(items).replace('[', '').replace(']', '')


# In[5]:
filter_assets_value(risk_df)

# NaN values are replaced later on, so an empty item list is stored as an
# empty string instead of '[]'.
# New assets column
NTM_df['ASSETS_VAL'] = [_strip_brackets(items) for items in asset_val]
NTM_df[:1]

# New column of intent value
NTM_df['INTENT_VAL'] = [_strip_brackets(items) for items in intent_val]
NTM_df[:1]

# New column of SOURCE_VAL value
NTM_df['SOURCE_VAL'] = [_strip_brackets(items) for items in source_val]
NTM_df[:5]


# In[ ]:
# In[6]:
# In[361]:
NTM_df.drop(columns=['RISK_V2'], inplace=True)
NTM_df.columns


# In[362]:
# NTM_df
# In[ ]:
# In[7]:
##################### Start working from here. #####################
##################### Apply the algorithm to every combination of the 12 items below (all except 12. device ACCD_FIND_MTD_CODE) #####################
# It should be 13 columns in total
#  1. Institution               INST_NM
#  2. Attack type               DRULE_ATT_TYPE_CODE1
#  3. Asset                     ASSETS_VAL
#  4. Threat attacker IP        TW_ATT_IP
#  5. Threat attacker port      TW_ATT_PORT
#  6. Threat victim IP          TW_DMG_IP
#  7. Threat victim port        TW_DMG_PORT
#  8. Threat victim protocol    ACCD_DMG_PROTO_NM
#  9. Attacking country         TW_ATT_CT_NM
# 10. Intent (7 kinds)          INTENT_VAL
# 11. IP/URL weight             SOURCE_VAL
# 12. Device                    ACCD_FIND_MTD_CODE
# 13. Detection rule name       DRULE_NM

# In[363]:
NTM_df.isna().sum()

# Change NaN to zero / empty string.  fillna returns a new frame, so the
# assignments below never write into a slice of df.
NTM_df = NTM_df.fillna({
    'ACCD_DMG_PROTO_NM': '',
    'INST_NM': '',
    'DRULE_ATT_TYPE_CODE1': '',
    'TW_ATT_IP': 0,
    'TW_ATT_PORT': 0,
    'TW_DMG_IP': 0,
    'TW_DMG_PORT': 0,
    'TW_ATT_CT_NM': '',
    'ASSETS_VAL': 0,
    'INTENT_VAL': 0,
    'SOURCE_VAL': 0,
    'DRULE_NM': '',
})

# Check NaN out again
NTM_df.isna().sum()


# In[366]:
# Item columns, in the order they appear in every sequence.
_NTM_ITEM_COLUMNS = [
    'INST_NM', 'TW_ATT_IP', 'TW_ATT_PORT', 'TW_DMG_IP', 'TW_DMG_PORT',
    'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM', 'ASSETS_VAL', 'INTENT_VAL',
    'SOURCE_VAL', 'DRULE_ATT_TYPE_CODE1', 'DRULE_NM',
]

# Merge all: make one space-separated string from all of the elements.
# Every column is cast to str so numeric columns cannot break the join.
_ntm_text = [NTM_df[column].astype(str) for column in _NTM_ITEM_COLUMNS]
NTM_df['Combined'] = _ntm_text[0].str.cat(_ntm_text[1:], sep=' ')
NTM_com = NTM_df['Combined']
NTM_com[:10]

# Modified merge: one list of items per row.
# Rows are taken positionally; looking rows up with NTM_df[col][i] would use
# i as an index *label*, which no longer runs 0..n-1 after filtering.
NTM_new_com = NTM_df[_NTM_ITEM_COLUMNS].values.tolist()

# Change the type to DataFrame
NTM_new_to_df = pd.DataFrame(NTM_new_com)
NTM_new_to_df[:5]
NTM_new_to_df.head()


# In[8]:
# Edit
NTM_new_tolist = NTM_new_to_df.values.tolist()
NTM_new_tolist[:2]


# In[9]:
from prefixspan import PrefixSpan


# In[370]:
# Apply prefixspan
PrefixSpan_NTM = PrefixSpan(NTM_new_tolist)
prefix_NTM = PrefixSpan_NTM.frequent(1)
# Naming the columns here keeps the frame well-formed even when no pattern
# was found (an empty frame would otherwise have no columns to rename).
prefix_NTM_df = pd.DataFrame(prefix_NTM, columns=['Frequency', 'Cause'])
prefix_NTM_df[:5]


# In[17]:
# Make the new column for filling the Effect
prefix_NTM_df['Effect'] = np.nan
# Change the order of columns
prefix_NTM_df = prefix_NTM_df[['Cause', 'Effect', 'Frequency']]
# Sort the scenarios of all possible combinations by descending Frequency.
prefix_NTM_df = prefix_NTM_df.sort_values(by=['Frequency'], ascending=False, ignore_index=True)


# In[ ]:
# In[373]:
def generate_cause(cell):
    """Fill ``prefix_NTM_df['Effect']`` with the rule name found in each cause.

    A rule name matches when it is an item of the row's ``Cause`` list.  When
    several names match, the last one in ``drules`` order is kept, as in the
    original loop.  Rows without a match keep NaN.

    The whole column is assigned at once: writing through
    ``prefix_NTM_df.iloc[i]['Effect'] = ...`` only changes a temporary copy.

    Args:
        cell: DataFrame with a ``Cause`` column holding item lists; expected
            to have the same number of rows as ``prefix_NTM_df``.

    Returns:
        None.  The module-level ``prefix_NTM_df`` is modified.
    """
    drules = ['Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB']
    effects = []
    for cause in cell['Cause']:
        matched = [drule for drule in drules if drule in cause]
        effects.append(matched[-1] if matched else np.nan)
    prefix_NTM_df['Effect'] = pd.Series(effects, index=prefix_NTM_df.index, dtype=object)


generate_cause(prefix_NTM_df)
# Assign the rule name as an effect
prefix_NTM_df.sort_values(by=['Frequency'], ascending=False)


# In[ ]:
# In[374]:
def _mentions_rule(pattern, rule):
    """Return True when a string item of *pattern* names *rule*.

    An item matches when it equals *rule* or contains it preceded by a space
    (the form it has inside a combined, space-separated string).  Non-string
    items such as ports are skipped instead of raising TypeError.
    """
    return any(
        isinstance(item, str) and (item == rule or ' ' + rule in item)
        for item in pattern
    )


# Attack Filter
def Attack_filter(ps):
    """Return True when pattern *ps* mentions the 'Attack' rule."""
    return _mentions_rule(ps, 'Attack')


att_filter = prefix_NTM_df[list(map(Attack_filter, prefix_NTM_df.Cause))].fillna('Attack')


# Malwr Filter
def Malwr_filter(ps):
    """Return True when pattern *ps* mentions the 'Malwr' rule."""
    return _mentions_rule(ps, 'Malwr')


mal_filter = prefix_NTM_df[list(map(Malwr_filter, prefix_NTM_df.Cause))].fillna('Malwr')


# DDOS Filter
def DDOS_filter(ps):
    """Return True when pattern *ps* mentions the 'DDOS' rule."""
    return _mentions_rule(ps, 'DDOS')
dd_filter = prefix_NTM_df[list(map(DDOS_filter, prefix_NTM_df.Cause))].fillna('DDOS')


def _mentions_rule(pattern, rule):
    """Return True when a string item of *pattern* names *rule*.

    An item matches when it equals *rule* or contains it preceded by a space
    (the form it has inside a combined, space-separated string).  Non-string
    items such as ports are skipped instead of raising TypeError.
    """
    return any(
        isinstance(item, str) and (item == rule or ' ' + rule in item)
        for item in pattern
    )


# HACK Filter
def HACK_filter(ps):
    """Return True when pattern *ps* mentions the 'HACK' rule."""
    return _mentions_rule(ps, 'HACK')


hack_filter = prefix_NTM_df[list(map(HACK_filter, prefix_NTM_df.Cause))].fillna('HACK')


# MAIL Filter
def MAIL_filter(ps):
    """Return True when pattern *ps* mentions the 'MAIL' rule."""
    return _mentions_rule(ps, 'MAIL')


mail_filter = prefix_NTM_df[list(map(MAIL_filter, prefix_NTM_df.Cause))].fillna('MAIL')


# WEB Filter
def WEB_filter(ps):
    """Return True when pattern *ps* mentions the 'WEB' rule."""
    return _mentions_rule(ps, 'WEB')


prefix_NTM_df
web_filter = prefix_NTM_df[list(map(WEB_filter, prefix_NTM_df.Cause))].fillna('WEB')

# A pattern that mentions several rules appears once per matching filter.
frames = [att_filter, mal_filter, dd_filter, hack_filter, mail_filter, web_filter]
result = pd.concat(frames)
result.sort_values(by=['Frequency'], ascending=False)


# In[ ]:
##################### NTM section End #####################
# In[ ]:
##################### MTM section #####################
# In[375]:
import json  # used below to decode the RISK_V2 cells

# .copy() detaches the subset from df, so the column assignments and the
# in-place drop below modify MTM_df itself rather than a temporary slice.
MTM_df = df[df['ACCD_FIND_MTD_CODE'] == 2].copy()
len(MTM_df)


# In[376]:
# Pick RISK_V2 out in order to extract the asset, intent and black-IP flags.
RISK_V2_MTM = MTM_df['RISK_V2']
RISK_V2_FILTERED_MTM = RISK_V2_MTM.dropna()
print(RISK_V2_MTM.size)
print(RISK_V2_FILTERED_MTM.size)


# Description lookup tables (field name -> label written into the item text).
_ASSET_DESC_MTM = {
    'ASSETS_VAL_1': '공인-전체IP대역(유선)',
    'ASSETS_VAL_2': '공인-전체IP대역(무선)',
    'ASSETS_VAL_3': '공인-WEB서버',
    'ASSETS_VAL_4': '공인-내부응용서버',
    'ASSETS_VAL_5': '공인-DB서버',
    'ASSETS_VAL_6': '공인-패치서버',
    'ASSETS_VAL_7': '공인-네트워크',
    'ASSETS_VAL_8': '공인-보안',
    'ASSETS_VAL_9': '공인-업무용PC',
    'ASSETS_VAL_10': '공인-비업무용PC',
    'ASSETS_VAL_11': '공인-기타',
    'ASSETS_VAL_12': '사설-전체IP대역(유선)',
    'ASSETS_VAL_13': '사설-전체IP대역(무선)',
    'ASSETS_VAL_14': '사설-WEB서버',
    'ASSETS_VAL_15': '사설-내부응용서버',
    'ASSETS_VAL_16': '사설-DB서버',
    'ASSETS_VAL_17': '사설-패치서버',
    'ASSETS_VAL_18': '사설-네트워크',
    'ASSETS_VAL_19': '사설-보안',
    'ASSETS_VAL_20': '사설-업무용PC',
    'ASSETS_VAL_21': '사설-비업무용PC',
    'ASSETS_VAL_22': '사설-기타',
}
_INTENT_DESC_MTM = {
    'INTENT_VAL_1': '파괴',
    'INTENT_VAL_2': '유출',
    'INTENT_VAL_3': '지연',
    'INTENT_VAL_4': '잠복',
    'INTENT_VAL_5': '단순침입',
    'INTENT_VAL_6': 'MD5',
    'INTENT_VAL_0': 'Default',
}
_SOURCE_DESC_MTM = {
    'SOURCE_VAL_1': '북한IP',
    'SOURCE_VAL_3': 'ECSC Black IP',
}


def _parse_risk_v2_MTM(raw):
    """Return one RISK_V2 cell as a mapping.

    String cells are decoded the same way the NTM section decodes them
    (single quotes turned into double quotes, then JSON).  Anything that is
    not a string is returned unchanged, so an already-decoded mapping works.
    """
    if isinstance(raw, str):
        return json.loads(raw.replace("'", '"'))
    return raw


def _collect_flags_MTM(risk, prefix, describe):
    """Return 'RISK_V2.<key>=<description>' for each truthy *prefix* key.

    Best effort: a cell that cannot be decoded or walked is reported and
    whatever was collected so far is returned, so the caller's column stays
    aligned with the rows.
    """
    found = []
    try:
        record = _parse_risk_v2_MTM(risk)
        for key in record:
            if prefix in key and record[key]:
                found.append('RISK_V2.' + key + '=' + describe(key))
    except (KeyError, TypeError, ValueError) as exc:
        print('RISK_V2 could not be read:', repr(exc), risk, type(risk))
    return found


# In[377]:
def filter_assets_value_MTM(risk):
    """Return the asset items of one RISK_V2 cell.

    Iterating the raw string would walk it character by character and never
    find a key, so the cell is decoded first.

    Args:
        risk: one RISK_V2 cell (string-encoded mapping, or a mapping).

    Returns:
        List of ``'RISK_V2.<key>=<description>'`` strings for every truthy
        key containing ``ASSETS_VAL_``; empty when the cell is unreadable.
    """
    return _collect_flags_MTM(risk, 'ASSETS_VAL_', get_asset_desc_MTM)


# In[378]:
# modified
def get_asset_desc_MTM(asset_field):
    """Return the label for an ``ASSETS_VAL_n`` field, or '' if unknown."""
    return _ASSET_DESC_MTM.get(asset_field, '')


# In[379]:
# New assets column
MTM_df['ASSETS_VAL'] = list(map(filter_assets_value_MTM, RISK_V2_FILTERED_MTM))
MTM_df['ASSETS_VAL'] = MTM_df['ASSETS_VAL'].astype(str)
MTM_df[:1]


# In[381]:
# modified
def filter_intent_MTM(intent):
    """Return the intent items of one RISK_V2 cell.

    Args:
        intent: one RISK_V2 cell (string-encoded mapping, or a mapping).

    Returns:
        List of ``'RISK_V2.<key>=<description>'`` strings for every truthy
        key containing ``INTENT_VAL_``; empty when the cell is unreadable.
    """
    return _collect_flags_MTM(intent, 'INTENT_VAL_', get_intent_desc_MTM)


# In[382]:
def get_intent_desc_MTM(intent_field):
    """Return the label for an ``INTENT_VAL_n`` field, or '' if unknown."""
    return _INTENT_DESC_MTM.get(intent_field, '')


# In[383]:
# New column of intent value
MTM_df['INTENT_VAL'] = list(map(filter_intent_MTM, RISK_V2_FILTERED_MTM))
MTM_df['INTENT_VAL'] = MTM_df['INTENT_VAL'].astype(str)
MTM_df[:1]


# In[384]:
# modified
def filter_source_MTM(source):
    """Return the source items of one RISK_V2 cell.

    Args:
        source: one RISK_V2 cell (string-encoded mapping, or a mapping).

    Returns:
        List of ``'RISK_V2.<key>=<description>'`` strings for every truthy
        key containing ``SOURCE_VAL_``; empty when the cell is unreadable.
    """
    return _collect_flags_MTM(source, 'SOURCE_VAL_', get_source_desc_MTM)


# In[385]:
def get_source_desc_MTM(source_field):
    """Return the label for a ``SOURCE_VAL_n`` field, or '' if unknown."""
    return _SOURCE_DESC_MTM.get(source_field, '')


# In[386]:
# New column of SOURCE_VAL value
MTM_df['SOURCE_VAL'] = list(map(filter_source_MTM, RISK_V2_FILTERED_MTM))
MTM_df['SOURCE_VAL'] = MTM_df['SOURCE_VAL'].astype(str)
MTM_df[:5]


# In[387]:
MTM_df.drop(columns=['RISK_V2'], inplace=True)
MTM_df.columns


# In[388]:
MTM_df.isna().sum()


# In[389]:
# Change NaN to zero / empty string (first part of the column list).
MTM_df = MTM_df.fillna({
    'ACCD_DMG_PROTO_NM': '',
    'INST_NM': '',
    'DRULE_ATT_TYPE_CODE1': '',
    'TW_ATT_IP': 0,
    'TW_ATT_PORT': 0,
    'TW_DMG_IP': 0,
    'TW_DMG_PORT': 0,
    'TW_ATT_CT_NM': '',
})
# Change NaN to zero / empty string (remaining columns).  fillna returns a
# new frame, so the 'Combined' assignment below never writes into a slice.
MTM_df = MTM_df.fillna({
    'ASSETS_VAL': 0,
    'INTENT_VAL': 0,
    'SOURCE_VAL': 0,
    'DRULE_NM': '',
})


# In[390]:
# Check NaN out again
MTM_df.isna().sum()


# In[391]:
# Merge all: make one space-separated string from all of the elements.
# Every column is cast to str so numeric columns cannot break the join.
_MTM_ITEM_COLUMNS = [
    'INST_NM', 'TW_ATT_IP', 'TW_ATT_PORT', 'TW_DMG_IP', 'TW_DMG_PORT',
    'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM', 'ASSETS_VAL', 'INTENT_VAL',
    'SOURCE_VAL', 'DRULE_ATT_TYPE_CODE1', 'DRULE_NM',
]
_mtm_text = [MTM_df[column].astype(str) for column in _MTM_ITEM_COLUMNS]
MTM_df['Combined'] = _mtm_text[0].str.cat(_mtm_text[1:], sep=' ')
MTM_com = MTM_df['Combined']
MTM_com[:10]


# In[392]:
# Change the type to DataFrame
MTM_to_df = pd.DataFrame(MTM_com)
MTM_to_df[:5]


# In[393]:
# Change the type to a nested list in order to apply the algorithm.
# Each sequence holds a single item: the row's combined string.
MTM_tolist = MTM_to_df.values.tolist()
MTM_tolist[:5]


# In[394]:
# Apply prefixspan
PrefixSpan_MTM = PrefixSpan(MTM_tolist)
###### Interchangeable ######
# Get every pattern with frequency of at least 1
prefix_MTM = PrefixSpan_MTM.frequent(1)
prefix_MTM[:3]


# In[395]:
# Put the result into a DataFrame.  Naming the columns here keeps the frame
# well-formed even when no pattern was found.
prefix_MTM_df = pd.DataFrame(prefix_MTM, columns=['Frequency', 'Cause'])
prefix_MTM_df[:5]


# In[396]:
# Make the new column for filling the Effect
prefix_MTM_df['Effect'] = np.nan
# Change the order of columns; .copy() makes the reordered frame independent
# so the Effect assignment below targets it directly.
prefix_MTM_df = prefix_MTM_df[['Cause', 'Effect', 'Frequency']].copy()
prefix_MTM_df[:2]


# In[397]:
def generate_cause_MTM(cell):
    """Return the first rule name found in the first item of *cell*.

    Args:
        cell: one ``Cause`` value, a list whose first item is the combined
            string of a row.

    Returns:
        The first entry of ``drules`` that occurs in ``cell[0]`` preceded by
        a space, or '' when none does.
    """
    drules = ['Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB']
    for drule in drules:
        if ' ' + drule in cell[0]:
            return drule
    return ''


# Map every cause to its rule name (the effect).  This must call
# generate_cause_MTM: the NTM generate_cause expects a whole DataFrame and
# returns None, so it cannot be mapped over individual causes.
effect_MTM = list(map(generate_cause_MTM, prefix_MTM_df.Cause))
# Assign the rule name as an effect
prefix_MTM_df['Effect'] = effect_MTM
prefix_MTM_df.sort_values(by=['Frequency'], ascending=False)


# In[399]:
# Each filter looks at the first item of a pattern and tells whether it
# contains the rule name preceded by a space.

# Attack Filter
def Attack_filter_MTM(ps):
    """Return True when the first item of *ps* contains ' Attack'."""
    head = ps[0]
    return ' Attack' in head


_attack_mask_MTM = [Attack_filter_MTM(cause) for cause in prefix_MTM_df.Cause]
att_filter_MTM = prefix_MTM_df[_attack_mask_MTM].fillna('Attack')


# Malwr Filter
def Malwr_filter_MTM(ps):
    """Return True when the first item of *ps* contains ' Malwr'."""
    head = ps[0]
    return ' Malwr' in head


_malwr_mask_MTM = [Malwr_filter_MTM(cause) for cause in prefix_MTM_df.Cause]
mal_filter_MTM = prefix_MTM_df[_malwr_mask_MTM].fillna('Malwr')


# DDOS Filter
def DDOS_filter_MTM(ps):
    """Return True when the first item of *ps* contains ' DDOS'."""
    head = ps[0]
    return ' DDOS' in head


_ddos_mask_MTM = [DDOS_filter_MTM(cause) for cause in prefix_MTM_df.Cause]
dd_filter_MTM = prefix_MTM_df[_ddos_mask_MTM].fillna('DDOS')


# HACK Filter
def HACK_filter_MTM(ps):
    """Return True when the first item of *ps* contains ' HACK'."""
    head = ps[0]
    return ' HACK' in head


_hack_mask_MTM = [HACK_filter_MTM(cause) for cause in prefix_MTM_df.Cause]
hack_filter_MTM = prefix_MTM_df[_hack_mask_MTM].fillna('HACK')


# MAIL Filter
def MAIL_filter_MTM(ps):
    """Return True when the first item of *ps* contains ' MAIL'."""
    head = ps[0]
    return ' MAIL' in head


_mail_mask_MTM = [MAIL_filter_MTM(cause) for cause in prefix_MTM_df.Cause]
mail_filter_MTM = prefix_MTM_df[_mail_mask_MTM].fillna('MAIL')


# WEB Filter
def WEB_filter_MTM(ps):
    """Return True when the first item of *ps* contains ' WEB'."""
    head = ps[0]
    return ' WEB' in head


prefix_MTM_df[:5]
_web_mask_MTM = [WEB_filter_MTM(cause) for cause in prefix_MTM_df.Cause]
web_filter_MTM = prefix_MTM_df[_web_mask_MTM].fillna('WEB')

# Stack the per-rule selections in the fixed rule order.
frames_MTM = [
    att_filter_MTM,
    mal_filter_MTM,
    dd_filter_MTM,
    hack_filter_MTM,
    mail_filter_MTM,
    web_filter_MTM,
]
result_MTM = pd.concat(frames_MTM)
result_MTM.sort_values(by=['Frequency'], ascending=False)


# In[ ]:
# In[ ]: