파이썬 기반의 Prefix span 분석_fork
Du kannst nicht mehr als 25 Themen auswählen Themen müssen mit entweder einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

PrefixSpan_edit_20210925.py 20KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # <p>NTM(유해트래픽 탐지장비)</p>
  4. # <p>MTM(악성파일 탐지장비)</p>
  5. # In[1]:
  6. #!/usr/bin/env python
  7. # coding: utf-8
  8. import pandas as pd
  9. import numpy as np
  10. from mlxtend.preprocessing import TransactionEncoder
  11. from mlxtend.frequent_patterns import association_rules, fpgrowth
  12. from prefixspan import PrefixSpan
  13. # load ts_data_accident-2020_sample.csv
  14. # to prevent dtypewarning, set low_memory=False
  15. df = pd.read_csv('ts_data_accident-2020_sample.csv', low_memory=False)
  16. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
  17. len(df) #len(df) : 10000, load successful
  18. df.head()
  19. # In[2]:
  20. ##################### NTM section #####################
  21. NTM_df=df[df['ACCD_FIND_MTD_CODE']==1] #* edit'1' to 1
  22. len(NTM_df)
  23. #*NTM_df.head()
  24. # In[3]:
  25. # Pick out it in order to get the asset, risk, intent, black IP out
  26. RISK_V2=NTM_df['RISK_V2']
  27. RISK_V2_FILTERED=RISK_V2.dropna()
  28. print(RISK_V2.size)
  29. print(RISK_V2_FILTERED.size)
  30. #* 추가 : 기존 filter_assets_value 사용시 값을 인식하지 못하는 문제 발생 -> RISK_V2를 별도의 df로 수정
  31. import json
  32. from pandas import json_normalize
  33. risk_df = pd.DataFrame()
  34. for newVal in RISK_V2_FILTERED:
  35. newVal = newVal.replace("'", "\"")
  36. newVal_str = json.loads(newVal)
  37. newVal_df = json_normalize(newVal_str)
  38. risk_df = pd.concat([risk_df,newVal_df],ignore_index=True)
  39. risk_df_col = risk_df.columns.values.tolist()
  40. # In[4]:
  41. # In[352]:
  42. asset_val = []
  43. intent_val=[]
  44. source_val=[]
  45. def filter_assets_value(risk):
  46. for i in range(len(risk)):
  47. risks=[]
  48. intents=[]
  49. sources=[]
  50. try:
  51. for key in risk_df_col:
  52. if 'ASSETS_VAL_' in key and risk.iloc[i][key]:
  53. risk_key_desc = 'RISK_V2.' + key + '=' + get_asset_desc(key)
  54. risks.append(risk_key_desc)
  55. if 'INTENT_VAL_' in key and risk.iloc[i][key]:
  56. intent_key_desc = 'RISK_V2.' + key + '=' + get_intent_desc(key)
  57. intents.append(intent_key_desc)
  58. if 'SOURCE_VAL_' in key and risk.iloc[i][key]:
  59. source_key_desc='RISK_V2.' + key + '=' + get_source_desc(key)
  60. sources.append(source_key_desc)
  61. except:
  62. print(risk)
  63. print(type(risk))
  64. finally:
  65. asset_val.append(risks)
  66. intent_val.append(intents)
  67. source_val.append(sources)
  68. # modified
  69. def get_asset_desc(asset_field):
  70. if asset_field == 'ASSETS_VAL_1':
  71. return '공인-전체IP대역(유선)'
  72. elif asset_field == 'ASSETS_VAL_2':
  73. return '공인-전체IP대역(무선)'
  74. elif asset_field == 'ASSETS_VAL_3':
  75. return '공인-WEB서버'
  76. elif asset_field == 'ASSETS_VAL_4':
  77. return '공인-내부응용서버'
  78. elif asset_field == 'ASSETS_VAL_5':
  79. return '공인-DB서버'
  80. elif asset_field == 'ASSETS_VAL_6':
  81. return '공인-패치서버'
  82. elif asset_field == 'ASSETS_VAL_7':
  83. return '공인-네트워크'
  84. elif asset_field == 'ASSETS_VAL_8':
  85. return '공인-보안'
  86. elif asset_field == 'ASSETS_VAL_9':
  87. return '공인-업무용PC'
  88. elif asset_field == 'ASSETS_VAL_10':
  89. return '공인-비업무용PC'
  90. elif asset_field == 'ASSETS_VAL_11':
  91. return '공인-기타'
  92. elif asset_field == 'ASSETS_VAL_12':
  93. return '사설-전체IP대역(유선)'
  94. elif asset_field == 'ASSETS_VAL_13':
  95. return '사설-전체IP대역(무선)'
  96. elif asset_field == 'ASSETS_VAL_14':
  97. return '사설-WEB서버'
  98. elif asset_field == 'ASSETS_VAL_15':
  99. return '사설-내부응용서버'
  100. elif asset_field == 'ASSETS_VAL_16':
  101. return '사설-DB서버'
  102. elif asset_field == 'ASSETS_VAL_17':
  103. return '사설-패치서버'
  104. elif asset_field == 'ASSETS_VAL_18':
  105. return '사설-네트워크'
  106. elif asset_field == 'ASSETS_VAL_19':
  107. return '사설-보안'
  108. elif asset_field == 'ASSETS_VAL_20':
  109. return '사설-업무용PC'
  110. elif asset_field == 'ASSETS_VAL_21':
  111. return '사설-비업무용PC'
  112. elif asset_field == 'ASSETS_VAL_22':
  113. return '사설-기타'
  114. else:
  115. return ''
  116. # modified
  117. def filter_intent(intent):
  118. intents=[]
  119. for intent_key in intent:
  120. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  121. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  122. intents.append(intent_key_desc)
  123. return intents
  124. # In[356]:
  125. def get_intent_desc(intent_field):
  126. if intent_field == 'INTENT_VAL_1':
  127. return '파괴'
  128. elif intent_field == 'INTENT_VAL_2':
  129. return '유출'
  130. elif intent_field == 'INTENT_VAL_3':
  131. return '지연'
  132. elif intent_field == 'INTENT_VAL_4':
  133. return '잠복'
  134. elif intent_field == 'INTENT_VAL_5':
  135. return '단순침입'
  136. elif intent_field == 'INTENT_VAL_6':
  137. return 'MD5'
  138. elif intent_field == 'INTENT_VAL_0':
  139. return 'Default'
  140. else:
  141. return ''
  142. # In[358]:
  143. # modified
  144. def filter_source(source):
  145. sources=[]
  146. for source_key in source:
  147. if 'SOURCE_VAL_' in source_key and source[source_key]:
  148. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  149. sources.append(source_key_desc)
  150. return sources
  151. # In[359]:
  152. def get_source_desc(source_field):
  153. if source_field=='SOURCE_VAL_1':
  154. return '북한IP'
  155. if source_field=='SOURCE_VAL_3':
  156. return 'ECSC Black IP'
  157. else:
  158. return ''
  159. # In[5]:
  160. filter_assets_value(risk_df)
  161. #뒤에 isna()를 통해 na값을 0으로 바꿔주는 작업을 하므로, 값이 비어있는 경우 [] 대신 비워두기
  162. # New assets column
  163. NTM_df['ASSETS_VAL']= asset_val
  164. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].astype(str)
  165. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace('[','', regex=False)
  166. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace(']','', regex=False)
  167. NTM_df[:1]
  168. # New column of intent value
  169. NTM_df['INTENT_VAL']=intent_val
  170. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].astype(str)
  171. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace('[','',regex=False)
  172. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace(']','',regex=False)
  173. NTM_df[:1]
  174. # New column of SOURCE_VAL value
  175. NTM_df['SOURCE_VAL']=source_val
  176. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].astype(str)
  177. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace('[','',regex=False)
  178. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace(']','',regex=False)
  179. NTM_df[:5]
  180. # In[ ]:
  181. # In[6]:
  182. # In[361]:
  183. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  184. NTM_df.columns
  185. # In[362]:
  186. #NTM_df
  187. # In[ ]:
  188. # In[7]:
  189. ##################### 여기서부터 진행하시면 됩니다. #####################
  190. ##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################
  191. # It should be 13 columns in total
  192. # 1. 기관 INST_NM
  193. # 2. 공격 DRULE_ATT_TYPE_CODE1
  194. # 3. 자산 ASSETS_VAL
  195. # 4. 위협공격ip TW_ATT_IP
  196. # 5. 위협공격port TW_ATT_PORT
  197. # 6. 위협피해ip TW_DMG_IP
  198. # 7. 위협피해port TW_DMG_PORT
  199. # 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM
  200. # 9. 공격국가 TW_ATT_CT_NM
  201. # 10. 의도(7개) INTENT_VAL
  202. # 11. IP/URL 가중치 SOURCE_VAL
  203. # 12. 장비 ACCD_FIND_MTD_CODE
  204. # 13. 탐지규칙명 DRULE_NM
  205. # In[363]:
  206. NTM_df.isna().sum()
  207. # Change the Nan to zero
  208. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  209. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  210. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  211. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  212. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  213. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  214. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  215. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  216. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  217. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  218. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  219. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  220. # Check NaN out again
  221. NTM_df.isna().sum()
  222. # In[366]:
  223. # # Merge all
  224. # # Make one string from all of elements
  225. NTM_df['Combined']=NTM_df['INST_NM'].astype(str)+' '+NTM_df['TW_ATT_IP'].astype(str)+' '+NTM_df['TW_ATT_PORT'].astype(str)+' '+NTM_df['TW_DMG_IP'].astype(str)+' '+NTM_df['TW_DMG_PORT'].astype(str) +' '+NTM_df['ACCD_DMG_PROTO_NM'].astype(str)+' '+NTM_df['TW_ATT_CT_NM']+' '+NTM_df['ASSETS_VAL']+' '+NTM_df['INTENT_VAL']+' '+NTM_df['SOURCE_VAL']+' '+NTM_df['DRULE_ATT_TYPE_CODE1']+' '+NTM_df['DRULE_NM']
  226. NTM_com=NTM_df['Combined']
  227. NTM_com[:10]
  228. # 수정하여 merge한 부분
  229. NTM_new_com= []
  230. for i in range(0,len(NTM_df)):
  231. temp_list = []
  232. temp_list.append([NTM_df['INST_NM'][i],NTM_df['TW_ATT_IP'][i],NTM_df['TW_ATT_PORT'][i], NTM_df['TW_DMG_IP'][i],
  233. NTM_df['TW_DMG_PORT'][i], NTM_df['ACCD_DMG_PROTO_NM'][i], NTM_df['TW_ATT_CT_NM'][i], NTM_df['ASSETS_VAL'].loc[i],
  234. NTM_df['INTENT_VAL'].loc[i], NTM_df['SOURCE_VAL'].loc[i], NTM_df['DRULE_ATT_TYPE_CODE1'][i], NTM_df['DRULE_NM'][i]])
  235. NTM_new_com.extend(temp_list)
  236. # Change the type to DataFrame
  237. NTM_new_to_df=pd.DataFrame(NTM_new_com)
  238. NTM_new_to_df[:5]
  239. NTM_new_to_df.head()
  240. # In[8]:
  241. # Edit
  242. NTM_new_tolist=NTM_new_to_df.values.tolist()
  243. NTM_new_tolist[:2]
  244. # In[9]:
  245. from prefixspan import PrefixSpan
  246. # In[370]:
  247. # Apply prefixspan
  248. PrefixSpan_NTM = PrefixSpan(NTM_new_tolist)
  249. prefix_NTM=PrefixSpan_NTM.frequent(1)
  250. prefix_NTM_df=pd.DataFrame(prefix_NTM)
  251. prefix_NTM_df[:5]
  252. # In[17]:
  253. # Change the columns name
  254. prefix_NTM_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  255. # Make the new column for filling the Effect
  256. prefix_NTM_df['Effect']=np.nan
  257. # Change the order of columns
  258. prefix_NTM_df=prefix_NTM_df[['Cause','Effect','Frequency']]
  259. # 모든 가능한 조합에 대한 시나리오 Frequency 큰 값부터 정렬
  260. prefix_NTM_df= prefix_NTM_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)
  261. # In[ ]:
  262. # In[373]:
  263. # Define the function that find the rule name
  264. def generate_cause(cell):
  265. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  266. for i in range(len(prefix_NTM_df)):
  267. for drule in drules:
  268. temp_drule = cell.iloc[i]['Cause']
  269. if drule in temp_drule:
  270. prefix_NTM_df.iloc[i]['Effect'] = drule
  271. generate_cause(prefix_NTM_df)
  272. # Assign the rule name as an effect
  273. prefix_NTM_df.sort_values(by=['Frequency'],ascending=False)
  274. # In[ ]:
  275. # In[374]:
  276. # Attack Filter
  277. def Attack_filter(ps):
  278. return ' Attack' in ps[0]
  279. att_filter=prefix_NTM_df[list(map(Attack_filter, prefix_NTM_df.Cause))].fillna('Attack')
  280. # Malwr Filter
  281. def Malwr_filter(ps):
  282. return ' Malwr' in ps[0]
  283. mal_filter=prefix_NTM_df[list(map(Malwr_filter, prefix_NTM_df.Cause))].fillna('Malwr')
  284. # DDOS Filter
  285. def DDOS_filter(ps):
  286. return ' DDOS' in ps[0]
  287. dd_filter=prefix_NTM_df[list(map(DDOS_filter, prefix_NTM_df.Cause))].fillna('DDOS')
  288. # HACK Filter
  289. def HACK_filter(ps):
  290. return ' HACK' in ps[0]
  291. hack_filter=prefix_NTM_df[list(map(HACK_filter, prefix_NTM_df.Cause))].fillna('HACK')
  292. # MAIL Filter
  293. def MAIL_filter(ps):
  294. return ' MAIL' in ps[0]
  295. mail_filter=prefix_NTM_df[list(map(MAIL_filter, prefix_NTM_df.Cause))].fillna('MAIL')
  296. # WEB Filter
  297. def WEB_filter(ps):
  298. return ' WEB' in ps[0]
  299. prefix_NTM_df
  300. web_filter=prefix_NTM_df[list(map(WEB_filter, prefix_NTM_df.Cause))].fillna('WEB')
  301. frames = [att_filter, mal_filter, dd_filter, hack_filter, mail_filter, web_filter]
  302. result = pd.concat(frames)
  303. result.sort_values(by=['Frequency'],ascending=False)
  304. # In[ ]:
  305. ##################### NTM section End #####################
  306. # In[ ]:
  307. ##################### MTM section #####################
  308. # In[375]:
  309. MTM_df=df[df['ACCD_FIND_MTD_CODE']==2]
  310. len(MTM_df)
  311. # In[376]:
  312. # Pick out it in order to get the asset, risk, intent, black IP out
  313. RISK_V2_MTM=MTM_df['RISK_V2']
  314. RISK_V2_FILTERED_MTM=RISK_V2_MTM.dropna()
  315. print(RISK_V2_MTM.size)
  316. print(RISK_V2_FILTERED_MTM.size)
  317. # In[377]:
  318. def filter_assets_value_MTM(risk):
  319. risks=[]
  320. try:
  321. for risk_key in risk:
  322. if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
  323. risk_key_desc = 'RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key)
  324. risks.append(risk_key_desc)
  325. except:
  326. print(risk)
  327. print(type(risk))
  328. finally:
  329. return risks
  330. # In[378]:
  331. # modified
  332. def get_asset_desc_MTM(asset_field):
  333. if asset_field == 'ASSETS_VAL_1':
  334. return '공인-전체IP대역(유선)'
  335. elif asset_field == 'ASSETS_VAL_2':
  336. return '공인-전체IP대역(무선)'
  337. elif asset_field == 'ASSETS_VAL_3':
  338. return '공인-WEB서버'
  339. elif asset_field == 'ASSETS_VAL_4':
  340. return '공인-내부응용서버'
  341. elif asset_field == 'ASSETS_VAL_5':
  342. return '공인-DB서버'
  343. elif asset_field == 'ASSETS_VAL_6':
  344. return '공인-패치서버'
  345. elif asset_field == 'ASSETS_VAL_7':
  346. return '공인-네트워크'
  347. elif asset_field == 'ASSETS_VAL_8':
  348. return '공인-보안'
  349. elif asset_field == 'ASSETS_VAL_9':
  350. return '공인-업무용PC'
  351. elif asset_field == 'ASSETS_VAL_10':
  352. return '공인-비업무용PC'
  353. elif asset_field == 'ASSETS_VAL_11':
  354. return '공인-기타'
  355. elif asset_field == 'ASSETS_VAL_12':
  356. return '사설-전체IP대역(유선)'
  357. elif asset_field == 'ASSETS_VAL_13':
  358. return '사설-전체IP대역(무선)'
  359. elif asset_field == 'ASSETS_VAL_14':
  360. return '사설-WEB서버'
  361. elif asset_field == 'ASSETS_VAL_15':
  362. return '사설-내부응용서버'
  363. elif asset_field == 'ASSETS_VAL_16':
  364. return '사설-DB서버'
  365. elif asset_field == 'ASSETS_VAL_17':
  366. return '사설-패치서버'
  367. elif asset_field == 'ASSETS_VAL_18':
  368. return '사설-네트워크'
  369. elif asset_field == 'ASSETS_VAL_19':
  370. return '사설-보안'
  371. elif asset_field == 'ASSETS_VAL_20':
  372. return '사설-업무용PC'
  373. elif asset_field == 'ASSETS_VAL_21':
  374. return '사설-비업무용PC'
  375. elif asset_field == 'ASSETS_VAL_22':
  376. return '사설-기타'
  377. else:
  378. return ''
  379. # In[379]:
  380. # New assets column
  381. MTM_df['ASSETS_VAL']=list(map(filter_assets_value_MTM, RISK_V2_FILTERED_MTM))
  382. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].astype(str)
  383. MTM_df[:1]
  384. # In[381]:
  385. # modified
  386. def filter_intent_MTM(intent):
  387. intents=[]
  388. for intent_key in intent:
  389. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  390. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  391. intents.append(intent_key_desc)
  392. return intents
  393. # In[382]:
  394. def get_intent_desc_MTM(intent_field):
  395. if intent_field == 'INTENT_VAL_1':
  396. return '파괴'
  397. elif intent_field == 'INTENT_VAL_2':
  398. return '유출'
  399. elif intent_field == 'INTENT_VAL_3':
  400. return '지연'
  401. elif intent_field == 'INTENT_VAL_4':
  402. return '잠복'
  403. elif intent_field == 'INTENT_VAL_5':
  404. return '단순침입'
  405. elif intent_field == 'INTENT_VAL_6':
  406. return 'MD5'
  407. elif intent_field == 'INTENT_VAL_0':
  408. return 'Default'
  409. else:
  410. return ''
  411. # In[383]:
  412. # New column of intent value
  413. MTM_df['INTENT_VAL']=list(map(filter_intent_MTM, RISK_V2_FILTERED_MTM))
  414. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].astype(str)
  415. MTM_df[:1]
  416. # In[384]:
  417. # modified
  418. def filter_source_MTM(source):
  419. sources=[]
  420. for source_key in source:
  421. if 'SOURCE_VAL_' in source_key and source[source_key]:
  422. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  423. sources.append(source_key_desc)
  424. return sources
  425. # In[385]:
  426. def get_source_desc_MTM(source_field):
  427. if source_field=='SOURCE_VAL_1':
  428. return '북한IP'
  429. if source_field=='SOURCE_VAL_3':
  430. return 'ECSC Black IP'
  431. else:
  432. return ''
  433. # In[386]:
  434. # New column of SOURCE_VAL value
  435. MTM_df['SOURCE_VAL']=list(map(filter_source_MTM, RISK_V2_FILTERED_MTM))
  436. MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].astype(str)
  437. MTM_df[:5]
  438. # In[387]:
  439. MTM_df.drop(columns=['RISK_V2'], inplace=True)
  440. MTM_df.columns
  441. # In[388]:
  442. MTM_df.isna().sum()
  443. # In[389]:
  444. # Change the Nan to zero
  445. MTM_df['ACCD_DMG_PROTO_NM']=MTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  446. MTM_df['INST_NM']=MTM_df['INST_NM'].replace(np.nan,'')
  447. MTM_df['DRULE_ATT_TYPE_CODE1']=MTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  448. MTM_df['TW_ATT_IP']=MTM_df['TW_ATT_IP'].replace(np.nan,0)
  449. MTM_df['TW_ATT_PORT']=MTM_df['TW_ATT_PORT'].replace(np.nan,0)
  450. MTM_df['TW_DMG_IP']=MTM_df['TW_DMG_IP'].replace(np.nan,0)
  451. MTM_df['TW_DMG_PORT']=MTM_df['TW_DMG_PORT'].replace(np.nan,0)
  452. MTM_df['TW_ATT_CT_NM']=MTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  453. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].replace(np.nan,0)
  454. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].replace(np.nan,0)
  455. MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].replace(np.nan,0)
  456. MTM_df['DRULE_NM']=MTM_df['DRULE_NM'].replace(np.nan,'')
  457. # In[390]:
  458. # Check NaN out again
  459. MTM_df.isna().sum()
  460. # In[391]:
  461. # # Merge all
  462. # # Make one string from all of elements
  463. MTM_df['Combined']=MTM_df['INST_NM'].astype(str)+' '+MTM_df['TW_ATT_IP'].astype(str)+' '+MTM_df['TW_ATT_PORT'].astype(str)+' '+MTM_df['TW_DMG_IP'].astype(str)+' '+MTM_df['TW_DMG_PORT'].astype(str) +' '+MTM_df['ACCD_DMG_PROTO_NM'].astype(str)+' '+MTM_df['TW_ATT_CT_NM']+' '+MTM_df['ASSETS_VAL']+' '+MTM_df['INTENT_VAL']+' '+MTM_df['SOURCE_VAL']+' '+MTM_df['DRULE_ATT_TYPE_CODE1']+' '+MTM_df['DRULE_NM']
  464. MTM_com=MTM_df['Combined']
  465. MTM_com[:10]
  466. # In[392]:
  467. # Change the type to DataFrame
  468. MTM_to_df=pd.DataFrame(MTM_com)
  469. MTM_to_df[:5]
  470. # In[393]:
  471. # Change the type to list in order to apply the algorithm(nested list)
  472. MTM_tolist=MTM_to_df.values.tolist()
  473. MTM_tolist[:5]
  474. # In[394]:
  475. # Apply prefixspan
  476. PrefixSpan_MTM = PrefixSpan(MTM_tolist)
  477. ###### Interchangeable ######
  478. # Get any over frequency 1
  479. prefix_MTM=PrefixSpan_MTM.frequent(1)
  480. prefix_MTM[:3]
  481. # In[395]:
  482. # Put the result to DataFrame
  483. prefix_MTM_df=pd.DataFrame(prefix_MTM)
  484. prefix_MTM_df[:5]
  485. # In[396]:
  486. # Change the columns name
  487. prefix_MTM_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  488. # Make the new column for filling the Effect
  489. prefix_MTM_df['Effect']=np.nan
  490. # Change the order of columns
  491. prefix_MTM_df=prefix_MTM_df[['Cause','Effect','Frequency']]
  492. prefix_MTM_df[:2]
  493. # In[397]:
  494. # Define the function that find the rule name
  495. def generate_cause_MTM(cell):
  496. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  497. for drule in drules:
  498. if ' '+drule in cell[0]:
  499. return drule
  500. return ''
  501. # Mapping the rule name with cause that is the effect
  502. effect_MTM=list(map(generate_cause, prefix_MTM_df.Cause))
  503. # Assign the rule name as an effect
  504. prefix_MTM_df['Effect']=effect_MTM
  505. prefix_MTM_df.sort_values(by=['Frequency'],ascending=False)
  506. # In[399]:
  507. # Attack Filter
  508. def Attack_filter_MTM(ps):
  509. return ' Attack' in ps[0]
  510. att_filter_MTM=prefix_MTM_df[list(map(Attack_filter_MTM, prefix_MTM_df.Cause))].fillna('Attack')
  511. # Malwr Filter
  512. def Malwr_filter_MTM(ps):
  513. return ' Malwr' in ps[0]
  514. mal_filter_MTM=prefix_MTM_df[list(map(Malwr_filter_MTM, prefix_MTM_df.Cause))].fillna('Malwr')
  515. # DDOS Filter
  516. def DDOS_filter_MTM(ps):
  517. return ' DDOS' in ps[0]
  518. dd_filter_MTM=prefix_MTM_df[list(map(DDOS_filter_MTM, prefix_MTM_df.Cause))].fillna('DDOS')
  519. # HACK Filter
  520. def HACK_filter_MTM(ps):
  521. return ' HACK' in ps[0]
  522. hack_filter_MTM=prefix_MTM_df[list(map(HACK_filter_MTM, prefix_MTM_df.Cause))].fillna('HACK')
  523. # MAIL Filter
  524. def MAIL_filter_MTM(ps):
  525. return ' MAIL' in ps[0]
  526. mail_filter_MTM=prefix_MTM_df[list(map(MAIL_filter_MTM, prefix_MTM_df.Cause))].fillna('MAIL')
  527. # WEB Filter
  528. def WEB_filter_MTM(ps):
  529. return ' WEB' in ps[0]
  530. prefix_MTM_df[:5]
  531. web_filter_MTM=prefix_MTM_df[list(map(WEB_filter_MTM, prefix_MTM_df.Cause))].fillna('WEB')
  532. frames_MTM = [att_filter_MTM, mal_filter_MTM, dd_filter_MTM, hack_filter_MTM, mail_filter_MTM, web_filter_MTM]
  533. result_MTM = pd.concat(frames_MTM)
  534. result_MTM.sort_values(by=['Frequency'],ascending=False)
  535. # In[ ]:
  536. # In[ ]: