파이썬 기반의 Prefix span 분석_fork
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PrefixSpan_20210925.py 19KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. import os
  5. import array
  6. import math
  7. import pickle
  8. # import joblib
  9. import sys
  10. import argparse
  11. import pandas as pd
  12. import numpy as np
  13. import matplotlib.pyplot as plt
  14. from datetime import datetime
  15. from pprint import pprint
  16. import ssl
  17. from elasticsearch.connection import create_ssl_context
  18. from elasticsearch import Elasticsearch
  19. from elasticsearch import helpers
  20. import urllib3
  21. # In[3]:
  22. import pandas as pd
  23. import numpy as np
  24. from mlxtend.preprocessing import TransactionEncoder
  25. from mlxtend.frequent_patterns import association_rules, fpgrowth
  26. from prefixspan import PrefixSpan
  27. # In[4]:
  # TLS context with hostname checking and certificate verification disabled.
  # NOTE(review): this turns off every certificate check - only acceptable on
  # a trusted network; confirm this is intended.
  ssl_context = create_ssl_context()
  ssl_context.check_hostname = False
  ssl_context.verify_mode = ssl.CERT_NONE
  # In[12]:
  # Connection settings can be overridden through environment variables; the
  # defaults reproduce the values that used to be hard-coded here.
  # NOTE(review): a password is committed in source - rotate it and remove the
  # default once ES_PASSWORD is provisioned in the runtime environment.
  ES_HOST = os.environ.get('ES_HOST', '223.194.92.152')
  ES_PORT = int(os.environ.get('ES_PORT', '9200'))
  ES_USER = os.environ.get('ES_USER', 'elasticsearch')
  ES_PASSWORD = os.environ.get('ES_PASSWORD', 'hadoop2019@!@#$')
  es = Elasticsearch(
      hosts=[{'host': ES_HOST, 'port': ES_PORT}],
      scheme="http",
      verify_certs=False,
      timeout=300,
      ssl_context=ssl_context,
      http_auth=(ES_USER, ES_PASSWORD),
  )
  # Silence the warning urllib3 emits for unverified HTTPS requests.
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  34. # In[347]:
  35. ######## 2020, 1 year ########
  36. ######## There are no MTM data in 2018, 2019 ########
  # First page: up to 10,000 accident documents whose TW_COLLECT_DT falls in
  # the 2020 range, sorted by _id so the last hit's sort value can be used as
  # the search_after cursor for the next page.
  # NOTE(review): the upper bound is just after midnight on 2020-12-31, so
  # most of that day is excluded - confirm this is intended.
  body = {
      "size": 10000,
      "query": {
          "range": {
              "TW_COLLECT_DT": {
                  "gte": "2020-01-01T00:00:00.625+09:00",
                  "lte": "2020-12-31T00:00:00.625+09:00",
              }
          }
      },
      "sort": [{"_id": "asc"}],
  }
  res = es.search(index='ts_data_accident-2020', body=body)
  data = res['hits']['hits']
  # The hits live under res['hits']['hits']; the cursor is the sort value of
  # the last hit, or None when the page came back empty.
  nxt = data[-1]["sort"][0] if data else None
  total = res['hits']['total']
  # Keep only the document body (_source) of every hit.
  accident = [da['_source'] for da in data]
  df_10000 = pd.DataFrame(accident)
  print(df_10000.head())
  64. # In[ ]:
  65. ######## 2020, 1 year ########
  66. ######## There are no MTM data in 2018, 2019 ########
  # Second page: same query as the first page, resumed after the cursor
  # (nxt) taken from the last hit of the previous response.
  body = {
      "size": 10000,
      "search_after": [nxt],
      "query": {
          "range": {
              "TW_COLLECT_DT": {
                  "gte": "2020-01-01T00:00:00.625+09:00",
                  "lte": "2020-12-31T00:00:00.625+09:00",
              }
          }
      },
      "sort": [{"_id": "asc"}],
  }
  res = es.search(index='ts_data_accident-2020', body=body)
  data = res['hits']['hits']
  # The hits live under res['hits']['hits']; advance the cursor to the last
  # hit of this page, or None when the page came back empty.
  nxt = data[-1]["sort"][0] if data else None
  total = res['hits']['total']
  # Keep only the document body (_source) of every hit.
  accident = [da['_source'] for da in data]
  df_20000 = pd.DataFrame(accident)
  print(df_20000.head())
  95. # In[348]:
  # Stack the two fetched pages into the working frame before selecting
  # columns; without this step `df` is never bound.
  df = pd.concat([df_10000, df_20000], ignore_index=True)
  # Keep only the fields used by the analysis and drop rows missing any of them.
  df = df[[
      'RISK_V2', 'INST_NM', 'DRULE_ATT_TYPE_CODE1', 'TW_ATT_IP', 'TW_ATT_PORT',
      'TW_DMG_IP', 'TW_DMG_PORT', 'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM',
      'ACCD_FIND_MTD_CODE', 'DRULE_NM',
  ]].dropna()
  len(df)
  df.head()
  99. # In[349]:
  100. ##################### NTM section #####################
  101. # In[350]:
  # Rows whose ACCD_FIND_MTD_CODE is '1' form the NTM subset. .copy() makes it
  # an independent frame, so the column assignments made on NTM_df later do not
  # act on a slice of df (pandas SettingWithCopyWarning).
  # NOTE(review): the comparison is against the string '1'; if the field is
  # stored as a number the subset will be empty - confirm the stored type.
  NTM_df = df[df['ACCD_FIND_MTD_CODE'] == '1'].copy()
  len(NTM_df)
  # In[351]:
  # Pull out RISK_V2, the source of the asset, intent and black-IP flags.
  RISK_V2 = NTM_df['RISK_V2']
  RISK_V2_FILTERED = RISK_V2.dropna()
  print(RISK_V2.size)
  print(RISK_V2_FILTERED.size)
  110. # In[352]:
  def filter_assets_value(risk):
      """Collect labelled asset flags that are set in one RISK_V2 value.

      Args:
          risk: Mapping of flag name to value (presumably the RISK_V2 dict of
              one document - verify against the index mapping).

      Returns:
          A list of 'RISK_V2.<key>=<label>' strings, one for every key that
          contains 'ASSETS_VAL_' and has a truthy value. When ``risk`` cannot
          be walked as a mapping, the value and its type are printed and
          whatever was collected so far is returned.
      """
      risks = []
      try:
          for risk_key in risk:
              if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
                  risks.append('RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key))
      except (TypeError, KeyError, IndexError):
          # Best effort: report the malformed value instead of aborting the
          # whole column, but let unrelated errors (e.g. KeyboardInterrupt)
          # propagate.
          print(risk)
          print(type(risk))
      return risks
  123. # In[353]:
  124. # modified
  # Display label for every RISK_V2 asset flag. Flags 1-11 carry the "공인"
  # (public) labels and 12-22 the matching "사설" (private) labels.
  _ASSET_DESCRIPTIONS = {
      'ASSETS_VAL_1': '공인-전체IP대역(유선)',
      'ASSETS_VAL_2': '공인-전체IP대역(무선)',
      'ASSETS_VAL_3': '공인-WEB서버',
      'ASSETS_VAL_4': '공인-내부응용서버',
      'ASSETS_VAL_5': '공인-DB서버',
      'ASSETS_VAL_6': '공인-패치서버',
      'ASSETS_VAL_7': '공인-네트워크',
      'ASSETS_VAL_8': '공인-보안',
      'ASSETS_VAL_9': '공인-업무용PC',
      'ASSETS_VAL_10': '공인-비업무용PC',
      'ASSETS_VAL_11': '공인-기타',
      'ASSETS_VAL_12': '사설-전체IP대역(유선)',
      'ASSETS_VAL_13': '사설-전체IP대역(무선)',
      'ASSETS_VAL_14': '사설-WEB서버',
      'ASSETS_VAL_15': '사설-내부응용서버',
      'ASSETS_VAL_16': '사설-DB서버',
      'ASSETS_VAL_17': '사설-패치서버',
      'ASSETS_VAL_18': '사설-네트워크',
      'ASSETS_VAL_19': '사설-보안',
      'ASSETS_VAL_20': '사설-업무용PC',
      'ASSETS_VAL_21': '사설-비업무용PC',
      'ASSETS_VAL_22': '사설-기타',
  }


  def get_asset_desc(asset_field):
      """Return the display label of an asset flag.

      Args:
          asset_field: Flag name such as 'ASSETS_VAL_3'.

      Returns:
          The label for the flag, or '' when the name is not a known flag.
      """
      try:
          return _ASSET_DESCRIPTIONS.get(asset_field, '')
      except TypeError:
          # Unhashable input can never equal a flag name.
          return ''
  172. # In[354]:
  173. # New assets column
  # Add ASSETS_VAL: for each row, the list of asset labels rendered as a string.
  NTM_df['ASSETS_VAL'] = [str(filter_assets_value(risk)) for risk in RISK_V2_FILTERED]
  NTM_df[:1]
  177. # In[355]:
  178. # modified
  def filter_intent(intent):
      """Collect labelled intent flags that are set in one RISK_V2 value.

      Args:
          intent: Mapping of flag name to value (presumably the RISK_V2 dict
              of one document - verify against the index mapping).

      Returns:
          A list of 'RISK_V2.<key>=<label>' strings, one for every key that
          contains 'INTENT_VAL_' and has a truthy value. When ``intent``
          cannot be walked as a mapping, the value and its type are printed
          and whatever was collected so far is returned, matching
          filter_assets_value.
      """
      intents = []
      try:
          for intent_key in intent:
              if 'INTENT_VAL_' in intent_key and intent[intent_key]:
                  intents.append('RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key))
      except (TypeError, KeyError, IndexError):
          # Report the malformed value rather than aborting the whole column.
          print(intent)
          print(type(intent))
      return intents
  186. # In[356]:
  # Display label for every RISK_V2 intent flag.
  _INTENT_DESCRIPTIONS = {
      'INTENT_VAL_1': '파괴',
      'INTENT_VAL_2': '유출',
      'INTENT_VAL_3': '지연',
      'INTENT_VAL_4': '잠복',
      'INTENT_VAL_5': '단순침입',
      'INTENT_VAL_6': 'MD5',
      'INTENT_VAL_0': 'Default',
  }


  def get_intent_desc(intent_field):
      """Return the display label of an intent flag.

      Args:
          intent_field: Flag name such as 'INTENT_VAL_2'.

      Returns:
          The label for the flag, or '' when the name is not a known flag.
      """
      try:
          return _INTENT_DESCRIPTIONS.get(intent_field, '')
      except TypeError:
          # Unhashable input can never equal a flag name.
          return ''
  204. # In[357]:
  205. # New column of intent value
  # Add INTENT_VAL: for each row, the list of intent labels rendered as a string.
  NTM_df['INTENT_VAL'] = [str(filter_intent(risk)) for risk in RISK_V2_FILTERED]
  NTM_df[:1]
  209. # In[358]:
  210. # modified
  def filter_source(source):
      """Collect labelled source flags that are set in one RISK_V2 value.

      Args:
          source: Mapping of flag name to value (presumably the RISK_V2 dict
              of one document - verify against the index mapping).

      Returns:
          A list of 'RISK_V2.<key>=<label>' strings, one for every key that
          contains 'SOURCE_VAL_' and has a truthy value. When ``source``
          cannot be walked as a mapping, the value and its type are printed
          and whatever was collected so far is returned, matching
          filter_assets_value.
      """
      sources = []
      try:
          for source_key in source:
              if 'SOURCE_VAL_' in source_key and source[source_key]:
                  sources.append('RISK_V2.' + source_key + '=' + get_source_desc(source_key))
      except (TypeError, KeyError, IndexError):
          # Report the malformed value rather than aborting the whole column.
          print(source)
          print(type(source))
      return sources
  218. # In[359]:
  # Display label for the RISK_V2 source flags that have one.
  _SOURCE_DESCRIPTIONS = {
      'SOURCE_VAL_1': '북한IP',
      'SOURCE_VAL_3': 'ECSC Black IP',
  }


  def get_source_desc(source_field):
      """Return the display label of a source flag.

      Args:
          source_field: Flag name such as 'SOURCE_VAL_1'.

      Returns:
          The label for the flag, or '' when the name is not a known flag.
      """
      try:
          return _SOURCE_DESCRIPTIONS.get(source_field, '')
      except TypeError:
          # Unhashable input can never equal a flag name.
          return ''
  226. # In[360]:
  227. # New column of SOURCE_VAL value
  # Add SOURCE_VAL: for each row, the list of source labels rendered as a string.
  NTM_df['SOURCE_VAL'] = [str(filter_source(risk)) for risk in RISK_V2_FILTERED]
  NTM_df[:5]
  # In[361]:
  # The raw RISK_V2 column is no longer needed once the *_VAL columns exist.
  NTM_df.drop('RISK_V2', axis=1, inplace=True)
  NTM_df.columns
  234. # In[362]:
  # It should be 13 columns in total
  #  1. institution                 INST_NM
  #  2. attack                      DRULE_ATT_TYPE_CODE1
  #  3. asset                       ASSETS_VAL
  #  4. threat attack IP            TW_ATT_IP
  #  5. threat attack port          TW_ATT_PORT
  #  6. threat victim IP            TW_DMG_IP
  #  7. threat victim port          TW_DMG_PORT
  #  8. threat victim protocol      ACCD_DMG_PROTO_NM
  #  9. attack country              TW_ATT_CT_NM
  # 10. intent (7 kinds)            INTENT_VAL
  # 11. IP/URL weight               SOURCE_VAL
  # 12. equipment                   ACCD_FIND_MTD_CODE
  # 13. detection rule name         DRULE_NM
  #
  # In[363]:
  NTM_df.isna().sum()
  # In[364]:
  # Replace NaN with '' in the name/code fields and with 0 in the others.
  for column, filler in (
      ('ACCD_DMG_PROTO_NM', ''),
      ('INST_NM', ''),
      ('DRULE_ATT_TYPE_CODE1', ''),
      ('TW_ATT_IP', 0),
      ('TW_ATT_PORT', 0),
      ('TW_DMG_IP', 0),
      ('TW_DMG_PORT', 0),
      ('TW_ATT_CT_NM', ''),
      ('ASSETS_VAL', 0),
      ('INTENT_VAL', 0),
      ('SOURCE_VAL', 0),
      ('DRULE_NM', ''),
  ):
      NTM_df[column] = NTM_df[column].replace(np.nan, filler)
  # In[365]:
  # Check NaN out again
  NTM_df.isna().sum()
  # In[366]:
  # Merge the fields into one space-separated string per row. The parentheses
  # are what lets the expression continue across lines: without them the
  # statement ends on its first line and every following line is evaluated on
  # its own, starting with a unary '+' applied to a str.
  NTM_df['Combined'] = (
      NTM_df['INST_NM'].astype(str) + ' ' + NTM_df['TW_ATT_IP'].astype(str)
      + ' ' + NTM_df['TW_ATT_PORT'].astype(str) + ' ' + NTM_df['TW_DMG_IP'].astype(str) + ' '
      + NTM_df['TW_DMG_PORT'].astype(str) + ' ' + NTM_df['ACCD_DMG_PROTO_NM'].astype(str)
      + ' ' + NTM_df['TW_ATT_CT_NM'] + ' ' + NTM_df['ASSETS_VAL'] + ' ' + NTM_df['INTENT_VAL'] + ' '
      + NTM_df['SOURCE_VAL'] + ' ' + NTM_df['DRULE_ATT_TYPE_CODE1'] + ' ' + NTM_df['DRULE_NM']
  )
  NTM_com = NTM_df['Combined']
  NTM_com[:10]
  279. # In[367]:
  280. # Change the type to DataFrame
  # Wrap the combined strings in a one-column DataFrame.
  NTM_to_df = NTM_com.to_frame()
  NTM_to_df[:5]
  # In[368]:
  # Convert to a nested list (one single-item list per row), the input shape
  # handed to the algorithm.
  NTM_tolist = NTM_to_df.to_numpy().tolist()
  NTM_tolist[:5]
  287. # In[369]:
  288. from prefixspan import PrefixSpan
  289. # In[370]:
  290. # Apply prefixspan
  # Run PrefixSpan over the NTM rows.
  PrefixSpan_NTM = PrefixSpan(NTM_tolist)
  ###### Interchangeable ######
  # Threshold passed to frequent(); 1 keeps every pattern found.
  prefix_NTM = PrefixSpan_NTM.frequent(1)
  prefix_NTM[:3]
  # In[371]:
  # Load the (frequency, pattern) results into a DataFrame.
  prefix_NTM_df = pd.DataFrame(prefix_NTM)
  prefix_NTM_df[:5]
  # In[372]:
  # Name the two result columns.
  prefix_NTM_df = prefix_NTM_df.rename(columns={0: 'Frequency', 1: 'Cause'})
  # Placeholder column, filled with the effect in a later step.
  prefix_NTM_df['Effect'] = np.nan
  # Reorder the columns.
  prefix_NTM_df = prefix_NTM_df.loc[:, ['Cause', 'Effect', 'Frequency']]
  prefix_NTM_df[:2]
  308. # In[373]:
  309. # Define the function that find the rule name
  def generate_cause(cell, drules=('Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB')):
      """Find the rule keyword mentioned in the first item of a pattern.

      Args:
          cell: Pattern sequence; only its first item (a string) is inspected.
          drules: Keywords to look for, tried in this order. Defaults to the
              six rule names used by this analysis.

      Returns:
          The first keyword of ``drules`` that occurs in ``cell[0]`` preceded
          by a space, or '' when none does or when ``cell`` is empty.
      """
      if not cell:
          # An empty pattern has no first item to inspect.
          return ''
      text = cell[0]
      for drule in drules:
          if ' ' + drule in text:
              return drule
      return ''
  316. # Mapping the rule name with cause that is the effect
  # Derive the effect (rule keyword) of every mined pattern.
  effect = [generate_cause(cause) for cause in prefix_NTM_df.Cause]
  # Store it in the Effect column.
  prefix_NTM_df['Effect'] = effect
  prefix_NTM_df.sort_values(by=['Frequency'], ascending=False)
  321. # In[374]:
  322. # Attack Filter
  def Attack_filter(ps):
      """Tell whether the first item of pattern *ps* contains ' Attack'."""
      first_item = ps[0]
      return ' Attack' in first_item


  att_filter = prefix_NTM_df.loc[[Attack_filter(cause) for cause in prefix_NTM_df.Cause]].fillna('Attack')


  # Malwr Filter
  def Malwr_filter(ps):
      """Tell whether the first item of pattern *ps* contains ' Malwr'."""
      first_item = ps[0]
      return ' Malwr' in first_item


  mal_filter = prefix_NTM_df.loc[[Malwr_filter(cause) for cause in prefix_NTM_df.Cause]].fillna('Malwr')


  # DDOS Filter
  def DDOS_filter(ps):
      """Tell whether the first item of pattern *ps* contains ' DDOS'."""
      first_item = ps[0]
      return ' DDOS' in first_item


  dd_filter = prefix_NTM_df.loc[[DDOS_filter(cause) for cause in prefix_NTM_df.Cause]].fillna('DDOS')


  # HACK Filter
  def HACK_filter(ps):
      """Tell whether the first item of pattern *ps* contains ' HACK'."""
      first_item = ps[0]
      return ' HACK' in first_item


  hack_filter = prefix_NTM_df.loc[[HACK_filter(cause) for cause in prefix_NTM_df.Cause]].fillna('HACK')


  # MAIL Filter
  def MAIL_filter(ps):
      """Tell whether the first item of pattern *ps* contains ' MAIL'."""
      first_item = ps[0]
      return ' MAIL' in first_item


  mail_filter = prefix_NTM_df.loc[[MAIL_filter(cause) for cause in prefix_NTM_df.Cause]].fillna('MAIL')


  # WEB Filter
  def WEB_filter(ps):
      """Tell whether the first item of pattern *ps* contains ' WEB'."""
      first_item = ps[0]
      return ' WEB' in first_item


  web_filter = prefix_NTM_df.loc[[WEB_filter(cause) for cause in prefix_NTM_df.Cause]].fillna('WEB')

  # Stack the six keyword subsets; a pattern that mentions several keywords
  # appears once per matching subset.
  frames = [att_filter, mal_filter, dd_filter, hack_filter, mail_filter, web_filter]
  result = pd.concat(frames)
  result.sort_values(by=['Frequency'], ascending=False)
  350. # In[ ]:
  351. ##################### NTM section End #####################
  352. # In[ ]:
  353. ##################### MTM section #####################
  354. # In[375]:
  # Rows whose ACCD_FIND_MTD_CODE is '2' form the MTM subset. .copy() makes it
  # an independent frame, so the column assignments made on MTM_df later do not
  # act on a slice of df (pandas SettingWithCopyWarning).
  # NOTE(review): the comparison is against the string '2'; if the field is
  # stored as a number the subset will be empty - confirm the stored type.
  MTM_df = df[df['ACCD_FIND_MTD_CODE'] == '2'].copy()
  len(MTM_df)
  # In[376]:
  # Pull out RISK_V2, the source of the asset, intent and black-IP flags.
  RISK_V2_MTM = MTM_df['RISK_V2']
  RISK_V2_FILTERED_MTM = RISK_V2_MTM.dropna()
  print(RISK_V2_MTM.size)
  print(RISK_V2_FILTERED_MTM.size)
  363. # In[377]:
  def filter_assets_value_MTM(risk):
      """Collect labelled asset flags that are set in one RISK_V2 value (MTM).

      Args:
          risk: Mapping of flag name to value (presumably the RISK_V2 dict of
              one document - verify against the index mapping).

      Returns:
          A list of 'RISK_V2.<key>=<label>' strings, one for every key that
          contains 'ASSETS_VAL_' and has a truthy value. When ``risk`` cannot
          be walked as a mapping, the value and its type are printed and
          whatever was collected so far is returned.
      """
      risks = []
      try:
          for risk_key in risk:
              if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
                  # Use the MTM-section helper so this section does not depend
                  # on the NTM-section definitions.
                  risks.append('RISK_V2.' + risk_key + '=' + get_asset_desc_MTM(risk_key))
      except (TypeError, KeyError, IndexError):
          # Best effort: report the malformed value instead of aborting the
          # whole column, but let unrelated errors (e.g. KeyboardInterrupt)
          # propagate.
          print(risk)
          print(type(risk))
      return risks
  376. # In[378]:
  377. # modified
  # Display label for every RISK_V2 asset flag (MTM section). Flags 1-11 carry
  # the "공인" (public) labels and 12-22 the matching "사설" (private) labels.
  _ASSET_DESCRIPTIONS_MTM = {
      'ASSETS_VAL_1': '공인-전체IP대역(유선)',
      'ASSETS_VAL_2': '공인-전체IP대역(무선)',
      'ASSETS_VAL_3': '공인-WEB서버',
      'ASSETS_VAL_4': '공인-내부응용서버',
      'ASSETS_VAL_5': '공인-DB서버',
      'ASSETS_VAL_6': '공인-패치서버',
      'ASSETS_VAL_7': '공인-네트워크',
      'ASSETS_VAL_8': '공인-보안',
      'ASSETS_VAL_9': '공인-업무용PC',
      'ASSETS_VAL_10': '공인-비업무용PC',
      'ASSETS_VAL_11': '공인-기타',
      'ASSETS_VAL_12': '사설-전체IP대역(유선)',
      'ASSETS_VAL_13': '사설-전체IP대역(무선)',
      'ASSETS_VAL_14': '사설-WEB서버',
      'ASSETS_VAL_15': '사설-내부응용서버',
      'ASSETS_VAL_16': '사설-DB서버',
      'ASSETS_VAL_17': '사설-패치서버',
      'ASSETS_VAL_18': '사설-네트워크',
      'ASSETS_VAL_19': '사설-보안',
      'ASSETS_VAL_20': '사설-업무용PC',
      'ASSETS_VAL_21': '사설-비업무용PC',
      'ASSETS_VAL_22': '사설-기타',
  }


  def get_asset_desc_MTM(asset_field):
      """Return the display label of an asset flag (MTM section).

      Args:
          asset_field: Flag name such as 'ASSETS_VAL_3'.

      Returns:
          The label for the flag, or '' when the name is not a known flag.
      """
      try:
          return _ASSET_DESCRIPTIONS_MTM.get(asset_field, '')
      except TypeError:
          # Unhashable input can never equal a flag name.
          return ''
  425. # In[379]:
  426. # New assets column
  # Add ASSETS_VAL: for each row, the list of asset labels rendered as a string.
  MTM_df['ASSETS_VAL'] = [str(filter_assets_value_MTM(risk)) for risk in RISK_V2_FILTERED_MTM]
  MTM_df[:1]
  430. # In[381]:
  431. # modified
  def filter_intent_MTM(intent):
      """Collect labelled intent flags that are set in one RISK_V2 value (MTM).

      Args:
          intent: Mapping of flag name to value (presumably the RISK_V2 dict
              of one document - verify against the index mapping).

      Returns:
          A list of 'RISK_V2.<key>=<label>' strings, one for every key that
          contains 'INTENT_VAL_' and has a truthy value. When ``intent``
          cannot be walked as a mapping, the value and its type are printed
          and whatever was collected so far is returned, matching
          filter_assets_value_MTM.
      """
      intents = []
      try:
          for intent_key in intent:
              if 'INTENT_VAL_' in intent_key and intent[intent_key]:
                  # Use the MTM-section helper so this section does not depend
                  # on the NTM-section definitions.
                  intents.append('RISK_V2.' + intent_key + '=' + get_intent_desc_MTM(intent_key))
      except (TypeError, KeyError, IndexError):
          # Report the malformed value rather than aborting the whole column.
          print(intent)
          print(type(intent))
      return intents
  439. # In[382]:
  # Display label for every RISK_V2 intent flag (MTM section).
  _INTENT_DESCRIPTIONS_MTM = {
      'INTENT_VAL_1': '파괴',
      'INTENT_VAL_2': '유출',
      'INTENT_VAL_3': '지연',
      'INTENT_VAL_4': '잠복',
      'INTENT_VAL_5': '단순침입',
      'INTENT_VAL_6': 'MD5',
      'INTENT_VAL_0': 'Default',
  }


  def get_intent_desc_MTM(intent_field):
      """Return the display label of an intent flag (MTM section).

      Args:
          intent_field: Flag name such as 'INTENT_VAL_2'.

      Returns:
          The label for the flag, or '' when the name is not a known flag.
      """
      try:
          return _INTENT_DESCRIPTIONS_MTM.get(intent_field, '')
      except TypeError:
          # Unhashable input can never equal a flag name.
          return ''
  457. # In[383]:
  458. # New column of intent value
  # Add INTENT_VAL: for each row, the list of intent labels rendered as a string.
  MTM_df['INTENT_VAL'] = [str(filter_intent_MTM(risk)) for risk in RISK_V2_FILTERED_MTM]
  MTM_df[:1]
  462. # In[384]:
  463. # modified
  def filter_source_MTM(source):
      """Collect labelled source flags that are set in one RISK_V2 value (MTM).

      Args:
          source: Mapping of flag name to value (presumably the RISK_V2 dict
              of one document - verify against the index mapping).

      Returns:
          A list of 'RISK_V2.<key>=<label>' strings, one for every key that
          contains 'SOURCE_VAL_' and has a truthy value. When ``source``
          cannot be walked as a mapping, the value and its type are printed
          and whatever was collected so far is returned, matching
          filter_assets_value_MTM.
      """
      sources = []
      try:
          for source_key in source:
              if 'SOURCE_VAL_' in source_key and source[source_key]:
                  # Use the MTM-section helper so this section does not depend
                  # on the NTM-section definitions.
                  sources.append('RISK_V2.' + source_key + '=' + get_source_desc_MTM(source_key))
      except (TypeError, KeyError, IndexError):
          # Report the malformed value rather than aborting the whole column.
          print(source)
          print(type(source))
      return sources
  471. # In[385]:
  # Display label for the RISK_V2 source flags that have one (MTM section).
  _SOURCE_DESCRIPTIONS_MTM = {
      'SOURCE_VAL_1': '북한IP',
      'SOURCE_VAL_3': 'ECSC Black IP',
  }


  def get_source_desc_MTM(source_field):
      """Return the display label of a source flag (MTM section).

      Args:
          source_field: Flag name such as 'SOURCE_VAL_1'.

      Returns:
          The label for the flag, or '' when the name is not a known flag.
      """
      try:
          return _SOURCE_DESCRIPTIONS_MTM.get(source_field, '')
      except TypeError:
          # Unhashable input can never equal a flag name.
          return ''
  479. # In[386]:
  480. # New column of SOURCE_VAL value
  # Add SOURCE_VAL: for each row, the list of source labels rendered as a string.
  MTM_df['SOURCE_VAL'] = [str(filter_source_MTM(risk)) for risk in RISK_V2_FILTERED_MTM]
  MTM_df[:5]
  # In[387]:
  # The raw RISK_V2 column is no longer needed once the *_VAL columns exist.
  MTM_df.drop('RISK_V2', axis=1, inplace=True)
  MTM_df.columns
  487. # In[388]:
  MTM_df.isna().sum()
  # In[389]:
  # Replace NaN with '' in the name/code fields and with 0 in the others.
  for column, filler in (
      ('ACCD_DMG_PROTO_NM', ''),
      ('INST_NM', ''),
      ('DRULE_ATT_TYPE_CODE1', ''),
      ('TW_ATT_IP', 0),
      ('TW_ATT_PORT', 0),
      ('TW_DMG_IP', 0),
      ('TW_DMG_PORT', 0),
      ('TW_ATT_CT_NM', ''),
      ('ASSETS_VAL', 0),
      ('INTENT_VAL', 0),
      ('SOURCE_VAL', 0),
      ('DRULE_NM', ''),
  ):
      MTM_df[column] = MTM_df[column].replace(np.nan, filler)
  # In[390]:
  # Confirm that no NaN is left.
  MTM_df.isna().sum()
  # In[391]:
  # Merge the fields into one space-separated string per row.
  MTM_df['Combined'] = (
      MTM_df['INST_NM'].astype(str)
      + ' ' + MTM_df['TW_ATT_IP'].astype(str)
      + ' ' + MTM_df['TW_ATT_PORT'].astype(str)
      + ' ' + MTM_df['TW_DMG_IP'].astype(str)
      + ' ' + MTM_df['TW_DMG_PORT'].astype(str)
      + ' ' + MTM_df['ACCD_DMG_PROTO_NM'].astype(str)
      + ' ' + MTM_df['TW_ATT_CT_NM']
      + ' ' + MTM_df['ASSETS_VAL']
      + ' ' + MTM_df['INTENT_VAL']
      + ' ' + MTM_df['SOURCE_VAL']
      + ' ' + MTM_df['DRULE_ATT_TYPE_CODE1']
      + ' ' + MTM_df['DRULE_NM']
  )
  MTM_com = MTM_df['Combined']
  MTM_com[:10]
  512. # In[392]:
  513. # Change the type to DataFrame
  # Wrap the combined strings in a one-column DataFrame.
  MTM_to_df = MTM_com.to_frame()
  MTM_to_df[:5]
  # In[393]:
  # Convert to a nested list (one single-item list per row), the input shape
  # handed to the algorithm.
  MTM_tolist = MTM_to_df.to_numpy().tolist()
  MTM_tolist[:5]
  # In[394]:
  # Run PrefixSpan over the MTM rows.
  PrefixSpan_MTM = PrefixSpan(MTM_tolist)
  ###### Interchangeable ######
  # Threshold passed to frequent(); 1 keeps every pattern found.
  prefix_MTM = PrefixSpan_MTM.frequent(1)
  prefix_MTM[:3]
  # In[395]:
  # Load the (frequency, pattern) results into a DataFrame.
  prefix_MTM_df = pd.DataFrame(prefix_MTM)
  prefix_MTM_df[:5]
  # In[396]:
  # Name the two result columns.
  prefix_MTM_df = prefix_MTM_df.rename(columns={0: 'Frequency', 1: 'Cause'})
  # Placeholder column, filled with the effect in a later step.
  prefix_MTM_df['Effect'] = np.nan
  # Reorder the columns.
  prefix_MTM_df = prefix_MTM_df.loc[:, ['Cause', 'Effect', 'Frequency']]
  prefix_MTM_df[:2]
  539. # In[397]:
  540. # Define the function that find the rule name
  def generate_cause_MTM(cell, drules=('Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB')):
      """Find the rule keyword mentioned in the first item of a pattern (MTM).

      Args:
          cell: Pattern sequence; only its first item (a string) is inspected.
          drules: Keywords to look for, tried in this order. Defaults to the
              six rule names used by this analysis.

      Returns:
          The first keyword of ``drules`` that occurs in ``cell[0]`` preceded
          by a space, or '' when none does or when ``cell`` is empty.
      """
      if not cell:
          # An empty pattern has no first item to inspect.
          return ''
      text = cell[0]
      for drule in drules:
          if ' ' + drule in text:
              return drule
      return ''
  547. # Mapping the rule name with cause that is the effect
  # Derive the effect (rule keyword) of every mined MTM pattern with the
  # MTM-section helper, so this section does not depend on the NTM-section
  # definition.
  effect_MTM = [generate_cause_MTM(cause) for cause in prefix_MTM_df.Cause]
  # Store it in the Effect column.
  prefix_MTM_df['Effect'] = effect_MTM
  prefix_MTM_df.sort_values(by=['Frequency'], ascending=False)
  552. # In[399]:
  553. # Attack Filter
  def Attack_filter_MTM(ps):
      """Tell whether the first item of pattern *ps* contains ' Attack'."""
      first_item = ps[0]
      return ' Attack' in first_item


  att_filter_MTM = prefix_MTM_df.loc[[Attack_filter_MTM(cause) for cause in prefix_MTM_df.Cause]].fillna('Attack')


  # Malwr Filter
  def Malwr_filter_MTM(ps):
      """Tell whether the first item of pattern *ps* contains ' Malwr'."""
      first_item = ps[0]
      return ' Malwr' in first_item


  mal_filter_MTM = prefix_MTM_df.loc[[Malwr_filter_MTM(cause) for cause in prefix_MTM_df.Cause]].fillna('Malwr')


  # DDOS Filter
  def DDOS_filter_MTM(ps):
      """Tell whether the first item of pattern *ps* contains ' DDOS'."""
      first_item = ps[0]
      return ' DDOS' in first_item


  dd_filter_MTM = prefix_MTM_df.loc[[DDOS_filter_MTM(cause) for cause in prefix_MTM_df.Cause]].fillna('DDOS')


  # HACK Filter
  def HACK_filter_MTM(ps):
      """Tell whether the first item of pattern *ps* contains ' HACK'."""
      first_item = ps[0]
      return ' HACK' in first_item


  hack_filter_MTM = prefix_MTM_df.loc[[HACK_filter_MTM(cause) for cause in prefix_MTM_df.Cause]].fillna('HACK')


  # MAIL Filter
  def MAIL_filter_MTM(ps):
      """Tell whether the first item of pattern *ps* contains ' MAIL'."""
      first_item = ps[0]
      return ' MAIL' in first_item


  mail_filter_MTM = prefix_MTM_df.loc[[MAIL_filter_MTM(cause) for cause in prefix_MTM_df.Cause]].fillna('MAIL')


  # WEB Filter
  def WEB_filter_MTM(ps):
      """Tell whether the first item of pattern *ps* contains ' WEB'."""
      first_item = ps[0]
      return ' WEB' in first_item


  web_filter_MTM = prefix_MTM_df.loc[[WEB_filter_MTM(cause) for cause in prefix_MTM_df.Cause]].fillna('WEB')

  # Stack the six keyword subsets; a pattern that mentions several keywords
  # appears once per matching subset.
  frames_MTM = [att_filter_MTM, mal_filter_MTM, dd_filter_MTM, hack_filter_MTM, mail_filter_MTM, web_filter_MTM]
  result_MTM = pd.concat(frames_MTM)
  result_MTM.sort_values(by=['Frequency'], ascending=False)
  581. # In[ ]: