파이썬 기반의 Prefix span 분석_fork
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

PrefixSpan_20210925.py 20KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. import os
  5. import array
  6. import math
  7. import pickle
  8. # import joblib
  9. import sys
  10. import argparse
  11. import pandas as pd
  12. import numpy as np
  13. import matplotlib.pyplot as plt
  14. from datetime import datetime
  15. from pprint import pprint
  16. import ssl
  17. from elasticsearch.connection import create_ssl_context
  18. from elasticsearch import Elasticsearch
  19. from elasticsearch import helpers
  20. import urllib3
  21. # In[3]:
  22. import pandas as pd
  23. import numpy as np
  24. from mlxtend.preprocessing import TransactionEncoder
  25. from mlxtend.frequent_patterns import association_rules, fpgrowth
  26. from prefixspan import PrefixSpan
  27. # In[4]:
  28. ssl_context = create_ssl_context()
  29. ssl_context.check_hostname = False
  30. ssl_context.verify_mode = ssl.CERT_NONE
  31. # In[12]:
  32. es = Elasticsearch(hosts=[{'host': '223.194.92.152', 'port': 9200}], scheme="http",verify_certs=False, timeout=300, ssl_context=ssl_context, http_auth=("elasticsearch", "hadoop2019@!@#$"))
  33. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  34. # In[347]:
  35. ######## 2020, 1 year ########
  36. ######## There are no MTM data in 2018, 2019 ########
  37. body = {
  38. "size" : 10000,
  39. "query": {
  40. "range":{
  41. "TW_COLLECT_DT":{
  42. "gte":"2020-01-01T00:00:00.625+09:00",
  43. "lte":"2020-12-31T00:00:00.625+09:00" ################
  44. }
  45. }
  46. },
  47. "sort":[{
  48. "_id":"asc"
  49. }]
  50. }
  51. res = es.search(index = 'ts_data_accident-2020', body=body)
  52. data = res['hits']['hits']
  53. nxt=res["hit"]["hit"][-1]["sort"][0]
  54. total = res['hits']['total']
  55. # print(total)
  56. accident = []
  57. for da in data:
  58. att_type = da['_source']
  59. # att_type["POL_NM"]=att_type["SCEN_INFOS"][0]["POL_NM"]
  60. accident.append(att_type)
  61. # df = pd.DataFrame(accident,dtype=str)
  62. df_10000 = pd.DataFrame(accident)
  63. print(df_10000.head())
  64. # In[ ]:
  65. ######## 2020, 1 year ########
  66. ######## There are no MTM data in 2018, 2019 ########
  67. body = {
  68. "size" : 10000,
  69. "search_after":[nxt],
  70. "query": {
  71. "range":{
  72. "TW_COLLECT_DT":{
  73. "gte":"2020-01-01T00:00:00.625+09:00",
  74. "lte":"2020-12-31T00:00:00.625+09:00" ################
  75. }
  76. }
  77. },
  78. "sort":[{
  79. "_id":"asc"
  80. }]
  81. }
  82. res = es.search(index = 'ts_data_accident-2020', body=body)
  83. data = res['hits']['hits']
  84. nxt=res["hit"]["hit"][-1]["sort"][0]
  85. total = res['hits']['total']
  86. # print(total)
  87. accident = []
  88. for da in data:
  89. att_type = da['_source']
  90. # att_type["POL_NM"]=att_type["SCEN_INFOS"][0]["POL_NM"]
  91. accident.append(att_type)
  92. # df = pd.DataFrame(accident,dtype=str)
  93. df_20000 = pd.DataFrame(accident)
  94. print(df_20000.head())
  95. # In[348]:
  96. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
  97. len(df)
  98. df.head()
  99. # In[349]:
  100. ##################### NTM section #####################
  101. # In[350]:
  102. NTM_df=df[df['ACCD_FIND_MTD_CODE']=='1']
  103. len(NTM_df)
  104. # In[351]:
  105. # Pick out it in order to get the asset, risk, intent, black IP out
  106. RISK_V2=NTM_df['RISK_V2']
  107. RISK_V2_FILTERED=RISK_V2.dropna()
  108. print(RISK_V2.size)
  109. print(RISK_V2_FILTERED.size)
  110. # In[352]:
  111. def filter_assets_value(risk):
  112. risks=[]
  113. try:
  114. for risk_key in risk:
  115. if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
  116. risk_key_desc = 'RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key)
  117. risks.append(risk_key_desc)
  118. except:
  119. print(risk)
  120. print(type(risk))
  121. finally:
  122. return risks
  123. # In[353]:
  124. # modified
  125. def get_asset_desc(asset_field):
  126. if asset_field == 'ASSETS_VAL_1':
  127. return '공인-전체IP대역(유선)'
  128. elif asset_field == 'ASSETS_VAL_2':
  129. return '공인-전체IP대역(무선)'
  130. elif asset_field == 'ASSETS_VAL_3':
  131. return '공인-WEB서버'
  132. elif asset_field == 'ASSETS_VAL_4':
  133. return '공인-내부응용서버'
  134. elif asset_field == 'ASSETS_VAL_5':
  135. return '공인-DB서버'
  136. elif asset_field == 'ASSETS_VAL_6':
  137. return '공인-패치서버'
  138. elif asset_field == 'ASSETS_VAL_7':
  139. return '공인-네트워크'
  140. elif asset_field == 'ASSETS_VAL_8':
  141. return '공인-보안'
  142. elif asset_field == 'ASSETS_VAL_9':
  143. return '공인-업무용PC'
  144. elif asset_field == 'ASSETS_VAL_10':
  145. return '공인-비업무용PC'
  146. elif asset_field == 'ASSETS_VAL_11':
  147. return '공인-기타'
  148. elif asset_field == 'ASSETS_VAL_12':
  149. return '사설-전체IP대역(유선)'
  150. elif asset_field == 'ASSETS_VAL_13':
  151. return '사설-전체IP대역(무선)'
  152. elif asset_field == 'ASSETS_VAL_14':
  153. return '사설-WEB서버'
  154. elif asset_field == 'ASSETS_VAL_15':
  155. return '사설-내부응용서버'
  156. elif asset_field == 'ASSETS_VAL_16':
  157. return '사설-DB서버'
  158. elif asset_field == 'ASSETS_VAL_17':
  159. return '사설-패치서버'
  160. elif asset_field == 'ASSETS_VAL_18':
  161. return '사설-네트워크'
  162. elif asset_field == 'ASSETS_VAL_19':
  163. return '사설-보안'
  164. elif asset_field == 'ASSETS_VAL_20':
  165. return '사설-업무용PC'
  166. elif asset_field == 'ASSETS_VAL_21':
  167. return '사설-비업무용PC'
  168. elif asset_field == 'ASSETS_VAL_22':
  169. return '사설-기타'
  170. else:
  171. return ''
  172. # In[354]:
  173. # New assets column
  174. NTM_df['ASSETS_VAL']=list(map(filter_assets_value, RISK_V2_FILTERED))
  175. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].astype(str)
  176. NTM_df[:1]
  177. # In[355]:
  178. # modified
  179. def filter_intent(intent):
  180. intents=[]
  181. for intent_key in intent:
  182. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  183. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  184. intents.append(intent_key_desc)
  185. return intents
  186. # In[356]:
  187. def get_intent_desc(intent_field):
  188. if intent_field == 'INTENT_VAL_1':
  189. return '파괴'
  190. elif intent_field == 'INTENT_VAL_2':
  191. return '유출'
  192. elif intent_field == 'INTENT_VAL_3':
  193. return '지연'
  194. elif intent_field == 'INTENT_VAL_4':
  195. return '잠복'
  196. elif intent_field == 'INTENT_VAL_5':
  197. return '단순침입'
  198. elif intent_field == 'INTENT_VAL_6':
  199. return 'MD5'
  200. elif intent_field == 'INTENT_VAL_0':
  201. return 'Default'
  202. else:
  203. return ''
  204. # In[357]:
  205. # New column of intent value
  206. NTM_df['INTENT_VAL']=list(map(filter_intent, RISK_V2_FILTERED))
  207. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].astype(str)
  208. NTM_df[:1]
  209. # In[358]:
  210. # modified
  211. def filter_source(source):
  212. sources=[]
  213. for source_key in source:
  214. if 'SOURCE_VAL_' in source_key and source[source_key]:
  215. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  216. sources.append(source_key_desc)
  217. return sources
  218. # In[359]:
  219. def get_source_desc(source_field):
  220. if source_field=='SOURCE_VAL_1':
  221. return '북한IP'
  222. if source_field=='SOURCE_VAL_3':
  223. return 'ECSC Black IP'
  224. else:
  225. return ''
  226. # In[360]:
  227. # New column of SOURCE_VAL value
  228. NTM_df['SOURCE_VAL']=list(map(filter_source, RISK_V2_FILTERED))
  229. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].astype(str)
  230. NTM_df[:5]
  231. # In[361]:
  232. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  233. NTM_df.columns
  234. # In[362]:
  235. ##################### 여기서부터 진행하시면 됩니다. #####################
  236. ##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################
  237. # It should be 13 columns in total
  238. # 1. 기관 INST_NM
  239. # 2. 공격 DRULE_ATT_TYPE_CODE1
  240. # 3. 자산 ASSETS_VAL
  241. # 4. 위협공격ip TW_ATT_IP
  242. # 5. 위협공격port TW_ATT_PORT
  243. # 6. 위협피해ip TW_DMG_IP
  244. # 7. 위협피해port TW_DMG_PORT
  245. # 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM
  246. # 9. 공격국가 TW_ATT_CT_NM
  247. # 10. 의도(7개) INTENT_VAL
  248. # 11. IP/URL 가중치 SOURCE_VAL
  249. # 12. 장비 ACCD_FIND_MTD_CODE
  250. # 13. 탐지규칙명 DRULE_NM
  251. #
  252. # In[363]:
  253. NTM_df.isna().sum()
  254. # In[364]:
  255. # Change the Nan to zero
  256. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  257. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  258. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  259. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  260. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  261. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  262. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  263. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  264. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  265. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  266. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  267. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  268. # In[365]:
  269. # Check NaN out again
  270. NTM_df.isna().sum()
  271. # In[366]:
  272. # # Merge all
  273. # # Make one string from all of elements
  274. NTM_df['Combined']=NTM_df['INST_NM'].astype(str)+' '+NTM_df['TW_ATT_IP'].astype(str)
  275. +' '+NTM_df['TW_ATT_PORT'].astype(str)+' '+NTM_df['TW_DMG_IP'].astype(str)+' '
  276. +NTM_df['TW_DMG_PORT'].astype(str) +' '+NTM_df['ACCD_DMG_PROTO_NM'].astype(str)
  277. +' '+NTM_df['TW_ATT_CT_NM']+' '+NTM_df['ASSETS_VAL']+' '+NTM_df['INTENT_VAL']+' '
  278. +NTM_df['SOURCE_VAL']+' '+NTM_df['DRULE_ATT_TYPE_CODE1']+' '+NTM_df['DRULE_NM']
  279. NTM_com=NTM_df['Combined']
  280. NTM_com[:10]
  281. # In[367]:
  282. # Change the type to DataFrame
  283. NTM_to_df=pd.DataFrame(NTM_com)
  284. NTM_to_df[:5]
  285. # In[368]:
  286. # Change the type to list in order to apply the algorithm(nested list)
  287. NTM_tolist=NTM_to_df.values.tolist()
  288. NTM_tolist[:5]
  289. # In[369]:
  290. from prefixspan import PrefixSpan
  291. # In[370]:
  292. # Apply prefixspan
  293. PrefixSpan_NTM = PrefixSpan(NTM_tolist)
  294. ###### Interchangeable ######
  295. # Get any over frequency 1
  296. prefix_NTM=PrefixSpan_NTM.frequent(1)
  297. prefix_NTM[:3]
  298. # In[371]:
  299. # Put the result to DataFrame
  300. prefix_NTM_df=pd.DataFrame(prefix_NTM)
  301. prefix_NTM_df[:5]
  302. # In[372]:
  303. # Change the columns name
  304. prefix_NTM_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  305. # Make the new column for filling the Effect
  306. prefix_NTM_df['Effect']=np.nan
  307. # Change the order of columns
  308. prefix_NTM_df=prefix_NTM_df[['Cause','Effect','Frequency']]
  309. prefix_NTM_df[:2]
  310. # In[373]:
  311. # Define the function that find the rule name
  312. def generate_cause(cell):
  313. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  314. for drule in drules:
  315. if ' '+drule in cell[0]:
  316. return drule
  317. return ''
  318. # Mapping the rule name with cause that is the effect
  319. effect=list(map(generate_cause, prefix_NTM_df.Cause))
  320. # Assign the rule name as an effect
  321. prefix_NTM_df['Effect']=effect
  322. prefix_NTM_df.sort_values(by=['Frequency'],ascending=False)
  323. # In[374]:
  324. # Attack Filter
  325. def Attack_filter(ps):
  326. return ' Attack' in ps[0]
  327. att_filter=prefix_NTM_df[list(map(Attack_filter, prefix_NTM_df.Cause))].fillna('Attack')
  328. # Malwr Filter
  329. def Malwr_filter(ps):
  330. return ' Malwr' in ps[0]
  331. mal_filter=prefix_NTM_df[list(map(Malwr_filter, prefix_NTM_df.Cause))].fillna('Malwr')
  332. # DDOS Filter
  333. def DDOS_filter(ps):
  334. return ' DDOS' in ps[0]
  335. dd_filter=prefix_NTM_df[list(map(DDOS_filter, prefix_NTM_df.Cause))].fillna('DDOS')
  336. # HACK Filter
  337. def HACK_filter(ps):
  338. return ' HACK' in ps[0]
  339. hack_filter=prefix_NTM_df[list(map(HACK_filter, prefix_NTM_df.Cause))].fillna('HACK')
  340. # MAIL Filter
  341. def MAIL_filter(ps):
  342. return ' MAIL' in ps[0]
  343. mail_filter=prefix_NTM_df[list(map(MAIL_filter, prefix_NTM_df.Cause))].fillna('MAIL')
  344. # WEB Filter
  345. def WEB_filter(ps):
  346. return ' WEB' in ps[0]
  347. prefix_NTM_df
  348. web_filter=prefix_NTM_df[list(map(WEB_filter, prefix_NTM_df.Cause))].fillna('WEB')
  349. frames = [att_filter, mal_filter, dd_filter, hack_filter, mail_filter, web_filter]
  350. result = pd.concat(frames)
  351. result.sort_values(by=['Frequency'],ascending=False)
  352. # In[ ]:
  353. ##################### NTM section End #####################
  354. # In[ ]:
  355. ##################### MTM section #####################
  356. # In[375]:
  357. MTM_df=df[df['ACCD_FIND_MTD_CODE']=='2']
  358. len(MTM_df)
  359. # In[376]:
  360. # Pick out it in order to get the asset, risk, intent, black IP out
  361. RISK_V2_MTM=MTM_df['RISK_V2']
  362. RISK_V2_FILTERED_MTM=RISK_V2_MTM.dropna()
  363. print(RISK_V2_MTM.size)
  364. print(RISK_V2_FILTERED_MTM.size)
  365. # In[377]:
  366. def filter_assets_value_MTM(risk):
  367. risks=[]
  368. try:
  369. for risk_key in risk:
  370. if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
  371. risk_key_desc = 'RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key)
  372. risks.append(risk_key_desc)
  373. except:
  374. print(risk)
  375. print(type(risk))
  376. finally:
  377. return risks
  378. # In[378]:
  379. # modified
  380. def get_asset_desc_MTM(asset_field):
  381. if asset_field == 'ASSETS_VAL_1':
  382. return '공인-전체IP대역(유선)'
  383. elif asset_field == 'ASSETS_VAL_2':
  384. return '공인-전체IP대역(무선)'
  385. elif asset_field == 'ASSETS_VAL_3':
  386. return '공인-WEB서버'
  387. elif asset_field == 'ASSETS_VAL_4':
  388. return '공인-내부응용서버'
  389. elif asset_field == 'ASSETS_VAL_5':
  390. return '공인-DB서버'
  391. elif asset_field == 'ASSETS_VAL_6':
  392. return '공인-패치서버'
  393. elif asset_field == 'ASSETS_VAL_7':
  394. return '공인-네트워크'
  395. elif asset_field == 'ASSETS_VAL_8':
  396. return '공인-보안'
  397. elif asset_field == 'ASSETS_VAL_9':
  398. return '공인-업무용PC'
  399. elif asset_field == 'ASSETS_VAL_10':
  400. return '공인-비업무용PC'
  401. elif asset_field == 'ASSETS_VAL_11':
  402. return '공인-기타'
  403. elif asset_field == 'ASSETS_VAL_12':
  404. return '사설-전체IP대역(유선)'
  405. elif asset_field == 'ASSETS_VAL_13':
  406. return '사설-전체IP대역(무선)'
  407. elif asset_field == 'ASSETS_VAL_14':
  408. return '사설-WEB서버'
  409. elif asset_field == 'ASSETS_VAL_15':
  410. return '사설-내부응용서버'
  411. elif asset_field == 'ASSETS_VAL_16':
  412. return '사설-DB서버'
  413. elif asset_field == 'ASSETS_VAL_17':
  414. return '사설-패치서버'
  415. elif asset_field == 'ASSETS_VAL_18':
  416. return '사설-네트워크'
  417. elif asset_field == 'ASSETS_VAL_19':
  418. return '사설-보안'
  419. elif asset_field == 'ASSETS_VAL_20':
  420. return '사설-업무용PC'
  421. elif asset_field == 'ASSETS_VAL_21':
  422. return '사설-비업무용PC'
  423. elif asset_field == 'ASSETS_VAL_22':
  424. return '사설-기타'
  425. else:
  426. return ''
  427. # In[379]:
  428. # New assets column
  429. MTM_df['ASSETS_VAL']=list(map(filter_assets_value_MTM, RISK_V2_FILTERED_MTM))
  430. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].astype(str)
  431. MTM_df[:1]
  432. # In[381]:
  433. # modified
  434. def filter_intent_MTM(intent):
  435. intents=[]
  436. for intent_key in intent:
  437. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  438. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  439. intents.append(intent_key_desc)
  440. return intents
  441. # In[382]:
  442. def get_intent_desc_MTM(intent_field):
  443. if intent_field == 'INTENT_VAL_1':
  444. return '파괴'
  445. elif intent_field == 'INTENT_VAL_2':
  446. return '유출'
  447. elif intent_field == 'INTENT_VAL_3':
  448. return '지연'
  449. elif intent_field == 'INTENT_VAL_4':
  450. return '잠복'
  451. elif intent_field == 'INTENT_VAL_5':
  452. return '단순침입'
  453. elif intent_field == 'INTENT_VAL_6':
  454. return 'MD5'
  455. elif intent_field == 'INTENT_VAL_0':
  456. return 'Default'
  457. else:
  458. return ''
  459. # In[383]:
  460. # New column of intent value
  461. MTM_df['INTENT_VAL']=list(map(filter_intent_MTM, RISK_V2_FILTERED_MTM))
  462. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].astype(str)
  463. MTM_df[:1]
  464. # In[384]:
  465. # modified
  466. def filter_source_MTM(source):
  467. sources=[]
  468. for source_key in source:
  469. if 'SOURCE_VAL_' in source_key and source[source_key]:
  470. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  471. sources.append(source_key_desc)
  472. return sources
  473. # In[385]:
  474. def get_source_desc_MTM(source_field):
  475. if source_field=='SOURCE_VAL_1':
  476. return '북한IP'
  477. if source_field=='SOURCE_VAL_3':
  478. return 'ECSC Black IP'
  479. else:
  480. return ''
  481. # In[386]:
  482. # New column of SOURCE_VAL value
  483. MTM_df['SOURCE_VAL']=list(map(filter_source_MTM, RISK_V2_FILTERED_MTM))
  484. MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].astype(str)
  485. MTM_df[:5]
  486. # In[387]:
  487. MTM_df.drop(columns=['RISK_V2'], inplace=True)
  488. MTM_df.columns
  489. # In[388]:
  490. MTM_df.isna().sum()
  491. # In[389]:
  492. # Change the Nan to zero
  493. MTM_df['ACCD_DMG_PROTO_NM']=MTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  494. MTM_df['INST_NM']=MTM_df['INST_NM'].replace(np.nan,'')
  495. MTM_df['DRULE_ATT_TYPE_CODE1']=MTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  496. MTM_df['TW_ATT_IP']=MTM_df['TW_ATT_IP'].replace(np.nan,0)
  497. MTM_df['TW_ATT_PORT']=MTM_df['TW_ATT_PORT'].replace(np.nan,0)
  498. MTM_df['TW_DMG_IP']=MTM_df['TW_DMG_IP'].replace(np.nan,0)
  499. MTM_df['TW_DMG_PORT']=MTM_df['TW_DMG_PORT'].replace(np.nan,0)
  500. MTM_df['TW_ATT_CT_NM']=MTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  501. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].replace(np.nan,0)
  502. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].replace(np.nan,0)
  503. MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].replace(np.nan,0)
  504. MTM_df['DRULE_NM']=MTM_df['DRULE_NM'].replace(np.nan,'')
  505. # In[390]:
  506. # Check NaN out again
  507. MTM_df.isna().sum()
  508. # In[391]:
  509. # # Merge all
  510. # # Make one string from all of elements
  511. MTM_df['Combined']=MTM_df['INST_NM'].astype(str)+' '+MTM_df['TW_ATT_IP'].astype(str)+' '+MTM_df['TW_ATT_PORT'].astype(str)+' '+MTM_df['TW_DMG_IP'].astype(str)+' '+MTM_df['TW_DMG_PORT'].astype(str) +' '+MTM_df['ACCD_DMG_PROTO_NM'].astype(str)+' '+MTM_df['TW_ATT_CT_NM']+' '+MTM_df['ASSETS_VAL']+' '+MTM_df['INTENT_VAL']+' '+MTM_df['SOURCE_VAL']+' '+MTM_df['DRULE_ATT_TYPE_CODE1']+' '+MTM_df['DRULE_NM']
  512. MTM_com=MTM_df['Combined']
  513. MTM_com[:10]
  514. # In[392]:
  515. # Change the type to DataFrame
  516. MTM_to_df=pd.DataFrame(MTM_com)
  517. MTM_to_df[:5]
  518. # In[393]:
  519. # Change the type to list in order to apply the algorithm(nested list)
  520. MTM_tolist=MTM_to_df.values.tolist()
  521. MTM_tolist[:5]
  522. # In[394]:
  523. # Apply prefixspan
  524. PrefixSpan_MTM = PrefixSpan(MTM_tolist)
  525. ###### Interchangeable ######
  526. # Get any over frequency 1
  527. prefix_MTM=PrefixSpan_MTM.frequent(1)
  528. prefix_MTM[:3]
  529. # In[395]:
  530. # Put the result to DataFrame
  531. prefix_MTM_df=pd.DataFrame(prefix_MTM)
  532. prefix_MTM_df[:5]
  533. # In[396]:
  534. # Change the columns name
  535. prefix_MTM_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  536. # Make the new column for filling the Effect
  537. prefix_MTM_df['Effect']=np.nan
  538. # Change the order of columns
  539. prefix_MTM_df=prefix_MTM_df[['Cause','Effect','Frequency']]
  540. prefix_MTM_df[:2]
  541. # In[397]:
  542. # Define the function that find the rule name
  543. def generate_cause_MTM(cell):
  544. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  545. for drule in drules:
  546. if ' '+drule in cell[0]:
  547. return drule
  548. return ''
  549. # Mapping the rule name with cause that is the effect
  550. effect_MTM=list(map(generate_cause, prefix_MTM_df.Cause))
  551. # Assign the rule name as an effect
  552. prefix_MTM_df['Effect']=effect_MTM
  553. prefix_MTM_df.sort_values(by=['Frequency'],ascending=False)
  554. # In[399]:
  555. # Attack Filter
  556. def Attack_filter_MTM(ps):
  557. return ' Attack' in ps[0]
  558. att_filter_MTM=prefix_MTM_df[list(map(Attack_filter_MTM, prefix_MTM_df.Cause))].fillna('Attack')
  559. # Malwr Filter
  560. def Malwr_filter_MTM(ps):
  561. return ' Malwr' in ps[0]
  562. mal_filter_MTM=prefix_MTM_df[list(map(Malwr_filter_MTM, prefix_MTM_df.Cause))].fillna('Malwr')
  563. # DDOS Filter
  564. def DDOS_filter_MTM(ps):
  565. return ' DDOS' in ps[0]
  566. dd_filter_MTM=prefix_MTM_df[list(map(DDOS_filter_MTM, prefix_MTM_df.Cause))].fillna('DDOS')
  567. # HACK Filter
  568. def HACK_filter_MTM(ps):
  569. return ' HACK' in ps[0]
  570. hack_filter_MTM=prefix_MTM_df[list(map(HACK_filter_MTM, prefix_MTM_df.Cause))].fillna('HACK')
  571. # MAIL Filter
  572. def MAIL_filter_MTM(ps):
  573. return ' MAIL' in ps[0]
  574. mail_filter_MTM=prefix_MTM_df[list(map(MAIL_filter_MTM, prefix_MTM_df.Cause))].fillna('MAIL')
  575. # WEB Filter
  576. def WEB_filter_MTM(ps):
  577. return ' WEB' in ps[0]
  578. prefix_MTM_df[:5]
  579. web_filter_MTM=prefix_MTM_df[list(map(WEB_filter_MTM, prefix_MTM_df.Cause))].fillna('WEB')
  580. frames_MTM = [att_filter_MTM, mal_filter_MTM, dd_filter_MTM, hack_filter_MTM, mail_filter_MTM, web_filter_MTM]
  581. result_MTM = pd.concat(frames_MTM)
  582. result_MTM.sort_values(by=['Frequency'],ascending=False)
  583. # In[ ]: