파이썬 기반의 Prefix span 분석
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

PrefixSpan_20210925.py 17KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. import pandas as pd
  5. import numpy as np
  6. from mlxtend.preprocessing import TransactionEncoder
  7. from mlxtend.frequent_patterns import association_rules, fpgrowth
  8. from prefixspan import PrefixSpan
  9. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
  10. len(df)
  11. df.head()
  12. # In[349]:
  13. ##################### NTM section #####################
  14. # In[350]:
  15. NTM_df=df[df['ACCD_FIND_MTD_CODE']=='1']
  16. len(NTM_df)
  17. # In[351]:
  18. # Pick out it in order to get the asset, risk, intent, black IP out
  19. RISK_V2=NTM_df['RISK_V2']
  20. RISK_V2_FILTERED=RISK_V2.dropna()
  21. print(RISK_V2.size)
  22. print(RISK_V2_FILTERED.size)
  23. # In[352]:
  24. def filter_assets_value(risk):
  25. risks=[]
  26. try:
  27. for risk_key in risk:
  28. if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
  29. risk_key_desc = 'RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key)
  30. risks.append(risk_key_desc)
  31. except:
  32. print(risk)
  33. print(type(risk))
  34. finally:
  35. return risks
  36. # In[353]:
  37. # modified
  38. def get_asset_desc(asset_field):
  39. if asset_field == 'ASSETS_VAL_1':
  40. return '공인-전체IP대역(유선)'
  41. elif asset_field == 'ASSETS_VAL_2':
  42. return '공인-전체IP대역(무선)'
  43. elif asset_field == 'ASSETS_VAL_3':
  44. return '공인-WEB서버'
  45. elif asset_field == 'ASSETS_VAL_4':
  46. return '공인-내부응용서버'
  47. elif asset_field == 'ASSETS_VAL_5':
  48. return '공인-DB서버'
  49. elif asset_field == 'ASSETS_VAL_6':
  50. return '공인-패치서버'
  51. elif asset_field == 'ASSETS_VAL_7':
  52. return '공인-네트워크'
  53. elif asset_field == 'ASSETS_VAL_8':
  54. return '공인-보안'
  55. elif asset_field == 'ASSETS_VAL_9':
  56. return '공인-업무용PC'
  57. elif asset_field == 'ASSETS_VAL_10':
  58. return '공인-비업무용PC'
  59. elif asset_field == 'ASSETS_VAL_11':
  60. return '공인-기타'
  61. elif asset_field == 'ASSETS_VAL_12':
  62. return '사설-전체IP대역(유선)'
  63. elif asset_field == 'ASSETS_VAL_13':
  64. return '사설-전체IP대역(무선)'
  65. elif asset_field == 'ASSETS_VAL_14':
  66. return '사설-WEB서버'
  67. elif asset_field == 'ASSETS_VAL_15':
  68. return '사설-내부응용서버'
  69. elif asset_field == 'ASSETS_VAL_16':
  70. return '사설-DB서버'
  71. elif asset_field == 'ASSETS_VAL_17':
  72. return '사설-패치서버'
  73. elif asset_field == 'ASSETS_VAL_18':
  74. return '사설-네트워크'
  75. elif asset_field == 'ASSETS_VAL_19':
  76. return '사설-보안'
  77. elif asset_field == 'ASSETS_VAL_20':
  78. return '사설-업무용PC'
  79. elif asset_field == 'ASSETS_VAL_21':
  80. return '사설-비업무용PC'
  81. elif asset_field == 'ASSETS_VAL_22':
  82. return '사설-기타'
  83. else:
  84. return ''
  85. # In[354]:
  86. # New assets column
  87. NTM_df['ASSETS_VAL']=list(map(filter_assets_value, RISK_V2_FILTERED))
  88. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].astype(str)
  89. NTM_df[:1]
  90. # In[355]:
  91. # modified
  92. def filter_intent(intent):
  93. intents=[]
  94. for intent_key in intent:
  95. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  96. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  97. intents.append(intent_key_desc)
  98. return intents
  99. # In[356]:
  100. def get_intent_desc(intent_field):
  101. if intent_field == 'INTENT_VAL_1':
  102. return '파괴'
  103. elif intent_field == 'INTENT_VAL_2':
  104. return '유출'
  105. elif intent_field == 'INTENT_VAL_3':
  106. return '지연'
  107. elif intent_field == 'INTENT_VAL_4':
  108. return '잠복'
  109. elif intent_field == 'INTENT_VAL_5':
  110. return '단순침입'
  111. elif intent_field == 'INTENT_VAL_6':
  112. return 'MD5'
  113. elif intent_field == 'INTENT_VAL_0':
  114. return 'Default'
  115. else:
  116. return ''
  117. # In[357]:
  118. # New column of intent value
  119. NTM_df['INTENT_VAL']=list(map(filter_intent, RISK_V2_FILTERED))
  120. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].astype(str)
  121. NTM_df[:1]
  122. # In[358]:
  123. # modified
  124. def filter_source(source):
  125. sources=[]
  126. for source_key in source:
  127. if 'SOURCE_VAL_' in source_key and source[source_key]:
  128. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  129. sources.append(source_key_desc)
  130. return sources
  131. # In[359]:
  132. def get_source_desc(source_field):
  133. if source_field=='SOURCE_VAL_1':
  134. return '북한IP'
  135. if source_field=='SOURCE_VAL_3':
  136. return 'ECSC Black IP'
  137. else:
  138. return ''
  139. # In[360]:
  140. # New column of SOURCE_VAL value
  141. NTM_df['SOURCE_VAL']=list(map(filter_source, RISK_V2_FILTERED))
  142. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].astype(str)
  143. NTM_df[:5]
  144. # In[361]:
  145. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  146. NTM_df.columns
  147. # In[362]:
##################### Start working from here (여기서부터 진행하시면 됩니다). #####################
##################### Apply the algorithm to every combination of the 12 items below (item 12, 장비 ACCD_FIND_MTD_CODE, is excluded) #####################
# It should be 13 columns in total
# 1. 기관 (institution) INST_NM
# 2. 공격 (attack) DRULE_ATT_TYPE_CODE1
# 3. 자산 (asset) ASSETS_VAL
# 4. 위협공격ip (threat attacker IP) TW_ATT_IP
# 5. 위협공격port (threat attacker port) TW_ATT_PORT
# 6. 위협피해ip (threat victim IP) TW_DMG_IP
# 7. 위협피해port (threat victim port) TW_DMG_PORT
# 8. 위협피해프로토콜 (threat victim protocol) ACCD_DMG_PROTO_NM
# 9. 공격국가 (attacking country) TW_ATT_CT_NM
# 10. 의도(7개) (intent, 7 kinds) INTENT_VAL
# 11. IP/URL 가중치 (IP/URL weight) SOURCE_VAL
# 12. 장비 (equipment) ACCD_FIND_MTD_CODE
# 13. 탐지규칙명 (detection rule name) DRULE_NM
#
  165. # In[363]:
  166. NTM_df.isna().sum()
  167. # In[364]:
  168. # Change the Nan to zero
  169. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  170. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  171. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  172. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  173. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  174. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  175. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  176. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  177. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  178. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  179. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  180. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  181. # In[365]:
  182. # Check NaN out again
  183. NTM_df.isna().sum()
  184. # In[366]:
  185. # # Merge all
  186. # # Make one string from all of elements
  187. NTM_df['Combined']=NTM_df['INST_NM'].astype(str)+' '+NTM_df['TW_ATT_IP'].astype(str)
  188. +' '+NTM_df['TW_ATT_PORT'].astype(str)+' '+NTM_df['TW_DMG_IP'].astype(str)+' '
  189. +NTM_df['TW_DMG_PORT'].astype(str) +' '+NTM_df['ACCD_DMG_PROTO_NM'].astype(str)
  190. +' '+NTM_df['TW_ATT_CT_NM']+' '+NTM_df['ASSETS_VAL']+' '+NTM_df['INTENT_VAL']+' '
  191. +NTM_df['SOURCE_VAL']+' '+NTM_df['DRULE_ATT_TYPE_CODE1']+' '+NTM_df['DRULE_NM']
  192. NTM_com=NTM_df['Combined']
  193. NTM_com[:10]
  194. # In[367]:
  195. # Change the type to DataFrame
  196. NTM_to_df=pd.DataFrame(NTM_com)
  197. NTM_to_df[:5]
  198. # In[368]:
  199. # Change the type to list in order to apply the algorithm(nested list)
  200. NTM_tolist=NTM_to_df.values.tolist()
  201. NTM_tolist[:5]
  202. # In[369]:
  203. from prefixspan import PrefixSpan
  204. # In[370]:
  205. # Apply prefixspan
  206. PrefixSpan_NTM = PrefixSpan(NTM_tolist)
  207. ###### Interchangeable ######
  208. # Get any over frequency 1
  209. prefix_NTM=PrefixSpan_NTM.frequent(1)
  210. prefix_NTM[:3]
  211. # In[371]:
  212. # Put the result to DataFrame
  213. prefix_NTM_df=pd.DataFrame(prefix_NTM)
  214. prefix_NTM_df[:5]
  215. # In[372]:
  216. # Change the columns name
  217. prefix_NTM_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  218. # Make the new column for filling the Effect
  219. prefix_NTM_df['Effect']=np.nan
  220. # Change the order of columns
  221. prefix_NTM_df=prefix_NTM_df[['Cause','Effect','Frequency']]
  222. prefix_NTM_df[:2]
  223. # In[373]:
  224. # Define the function that find the rule name
  225. def generate_cause(cell):
  226. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  227. for drule in drules:
  228. if ' '+drule in cell[0]:
  229. return drule
  230. return ''
  231. # Mapping the rule name with cause that is the effect
  232. effect=list(map(generate_cause, prefix_NTM_df.Cause))
  233. # Assign the rule name as an effect
  234. prefix_NTM_df['Effect']=effect
  235. prefix_NTM_df.sort_values(by=['Frequency'],ascending=False)
  236. # In[374]:
  237. # Attack Filter
  238. def Attack_filter(ps):
  239. return ' Attack' in ps[0]
  240. att_filter=prefix_NTM_df[list(map(Attack_filter, prefix_NTM_df.Cause))].fillna('Attack')
  241. # Malwr Filter
  242. def Malwr_filter(ps):
  243. return ' Malwr' in ps[0]
  244. mal_filter=prefix_NTM_df[list(map(Malwr_filter, prefix_NTM_df.Cause))].fillna('Malwr')
  245. # DDOS Filter
  246. def DDOS_filter(ps):
  247. return ' DDOS' in ps[0]
  248. dd_filter=prefix_NTM_df[list(map(DDOS_filter, prefix_NTM_df.Cause))].fillna('DDOS')
  249. # HACK Filter
  250. def HACK_filter(ps):
  251. return ' HACK' in ps[0]
  252. hack_filter=prefix_NTM_df[list(map(HACK_filter, prefix_NTM_df.Cause))].fillna('HACK')
  253. # MAIL Filter
  254. def MAIL_filter(ps):
  255. return ' MAIL' in ps[0]
  256. mail_filter=prefix_NTM_df[list(map(MAIL_filter, prefix_NTM_df.Cause))].fillna('MAIL')
  257. # WEB Filter
  258. def WEB_filter(ps):
  259. return ' WEB' in ps[0]
  260. prefix_NTM_df
  261. web_filter=prefix_NTM_df[list(map(WEB_filter, prefix_NTM_df.Cause))].fillna('WEB')
  262. frames = [att_filter, mal_filter, dd_filter, hack_filter, mail_filter, web_filter]
  263. result = pd.concat(frames)
  264. result.sort_values(by=['Frequency'],ascending=False)
  265. # In[ ]:
  266. ##################### NTM section End #####################
  267. # In[ ]:
  268. ##################### MTM section #####################
  269. # In[375]:
  270. MTM_df=df[df['ACCD_FIND_MTD_CODE']=='2']
  271. len(MTM_df)
  272. # In[376]:
  273. # Pick out it in order to get the asset, risk, intent, black IP out
  274. RISK_V2_MTM=MTM_df['RISK_V2']
  275. RISK_V2_FILTERED_MTM=RISK_V2_MTM.dropna()
  276. print(RISK_V2_MTM.size)
  277. print(RISK_V2_FILTERED_MTM.size)
  278. # In[377]:
  279. def filter_assets_value_MTM(risk):
  280. risks=[]
  281. try:
  282. for risk_key in risk:
  283. if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
  284. risk_key_desc = 'RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key)
  285. risks.append(risk_key_desc)
  286. except:
  287. print(risk)
  288. print(type(risk))
  289. finally:
  290. return risks
  291. # In[378]:
  292. # modified
  293. def get_asset_desc_MTM(asset_field):
  294. if asset_field == 'ASSETS_VAL_1':
  295. return '공인-전체IP대역(유선)'
  296. elif asset_field == 'ASSETS_VAL_2':
  297. return '공인-전체IP대역(무선)'
  298. elif asset_field == 'ASSETS_VAL_3':
  299. return '공인-WEB서버'
  300. elif asset_field == 'ASSETS_VAL_4':
  301. return '공인-내부응용서버'
  302. elif asset_field == 'ASSETS_VAL_5':
  303. return '공인-DB서버'
  304. elif asset_field == 'ASSETS_VAL_6':
  305. return '공인-패치서버'
  306. elif asset_field == 'ASSETS_VAL_7':
  307. return '공인-네트워크'
  308. elif asset_field == 'ASSETS_VAL_8':
  309. return '공인-보안'
  310. elif asset_field == 'ASSETS_VAL_9':
  311. return '공인-업무용PC'
  312. elif asset_field == 'ASSETS_VAL_10':
  313. return '공인-비업무용PC'
  314. elif asset_field == 'ASSETS_VAL_11':
  315. return '공인-기타'
  316. elif asset_field == 'ASSETS_VAL_12':
  317. return '사설-전체IP대역(유선)'
  318. elif asset_field == 'ASSETS_VAL_13':
  319. return '사설-전체IP대역(무선)'
  320. elif asset_field == 'ASSETS_VAL_14':
  321. return '사설-WEB서버'
  322. elif asset_field == 'ASSETS_VAL_15':
  323. return '사설-내부응용서버'
  324. elif asset_field == 'ASSETS_VAL_16':
  325. return '사설-DB서버'
  326. elif asset_field == 'ASSETS_VAL_17':
  327. return '사설-패치서버'
  328. elif asset_field == 'ASSETS_VAL_18':
  329. return '사설-네트워크'
  330. elif asset_field == 'ASSETS_VAL_19':
  331. return '사설-보안'
  332. elif asset_field == 'ASSETS_VAL_20':
  333. return '사설-업무용PC'
  334. elif asset_field == 'ASSETS_VAL_21':
  335. return '사설-비업무용PC'
  336. elif asset_field == 'ASSETS_VAL_22':
  337. return '사설-기타'
  338. else:
  339. return ''
  340. # In[379]:
  341. # New assets column
  342. MTM_df['ASSETS_VAL']=list(map(filter_assets_value_MTM, RISK_V2_FILTERED_MTM))
  343. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].astype(str)
  344. MTM_df[:1]
  345. # In[381]:
  346. # modified
  347. def filter_intent_MTM(intent):
  348. intents=[]
  349. for intent_key in intent:
  350. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  351. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  352. intents.append(intent_key_desc)
  353. return intents
  354. # In[382]:
  355. def get_intent_desc_MTM(intent_field):
  356. if intent_field == 'INTENT_VAL_1':
  357. return '파괴'
  358. elif intent_field == 'INTENT_VAL_2':
  359. return '유출'
  360. elif intent_field == 'INTENT_VAL_3':
  361. return '지연'
  362. elif intent_field == 'INTENT_VAL_4':
  363. return '잠복'
  364. elif intent_field == 'INTENT_VAL_5':
  365. return '단순침입'
  366. elif intent_field == 'INTENT_VAL_6':
  367. return 'MD5'
  368. elif intent_field == 'INTENT_VAL_0':
  369. return 'Default'
  370. else:
  371. return ''
  372. # In[383]:
  373. # New column of intent value
  374. MTM_df['INTENT_VAL']=list(map(filter_intent_MTM, RISK_V2_FILTERED_MTM))
  375. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].astype(str)
  376. MTM_df[:1]
  377. # In[384]:
  378. # modified
  379. def filter_source_MTM(source):
  380. sources=[]
  381. for source_key in source:
  382. if 'SOURCE_VAL_' in source_key and source[source_key]:
  383. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  384. sources.append(source_key_desc)
  385. return sources
  386. # In[385]:
  387. def get_source_desc_MTM(source_field):
  388. if source_field=='SOURCE_VAL_1':
  389. return '북한IP'
  390. if source_field=='SOURCE_VAL_3':
  391. return 'ECSC Black IP'
  392. else:
  393. return ''
  394. # In[386]:
  395. # New column of SOURCE_VAL value
  396. MTM_df['SOURCE_VAL']=list(map(filter_source_MTM, RISK_V2_FILTERED_MTM))
  397. MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].astype(str)
  398. MTM_df[:5]
  399. # In[387]:
  400. MTM_df.drop(columns=['RISK_V2'], inplace=True)
  401. MTM_df.columns
  402. # In[388]:
  403. MTM_df.isna().sum()
  404. # In[389]:
  405. # Change the Nan to zero
  406. MTM_df['ACCD_DMG_PROTO_NM']=MTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  407. MTM_df['INST_NM']=MTM_df['INST_NM'].replace(np.nan,'')
  408. MTM_df['DRULE_ATT_TYPE_CODE1']=MTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  409. MTM_df['TW_ATT_IP']=MTM_df['TW_ATT_IP'].replace(np.nan,0)
  410. MTM_df['TW_ATT_PORT']=MTM_df['TW_ATT_PORT'].replace(np.nan,0)
  411. MTM_df['TW_DMG_IP']=MTM_df['TW_DMG_IP'].replace(np.nan,0)
  412. MTM_df['TW_DMG_PORT']=MTM_df['TW_DMG_PORT'].replace(np.nan,0)
  413. MTM_df['TW_ATT_CT_NM']=MTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  414. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].replace(np.nan,0)
  415. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].replace(np.nan,0)
  416. MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].replace(np.nan,0)
  417. MTM_df['DRULE_NM']=MTM_df['DRULE_NM'].replace(np.nan,'')
  418. # In[390]:
  419. # Check NaN out again
  420. MTM_df.isna().sum()
  421. # In[391]:
  422. # # Merge all
  423. # # Make one string from all of elements
  424. MTM_df['Combined']=MTM_df['INST_NM'].astype(str)+' '+MTM_df['TW_ATT_IP'].astype(str)+' '+MTM_df['TW_ATT_PORT'].astype(str)+' '+MTM_df['TW_DMG_IP'].astype(str)+' '+MTM_df['TW_DMG_PORT'].astype(str) +' '+MTM_df['ACCD_DMG_PROTO_NM'].astype(str)+' '+MTM_df['TW_ATT_CT_NM']+' '+MTM_df['ASSETS_VAL']+' '+MTM_df['INTENT_VAL']+' '+MTM_df['SOURCE_VAL']+' '+MTM_df['DRULE_ATT_TYPE_CODE1']+' '+MTM_df['DRULE_NM']
  425. MTM_com=MTM_df['Combined']
  426. MTM_com[:10]
  427. # In[392]:
  428. # Change the type to DataFrame
  429. MTM_to_df=pd.DataFrame(MTM_com)
  430. MTM_to_df[:5]
  431. # In[393]:
  432. # Change the type to list in order to apply the algorithm(nested list)
  433. MTM_tolist=MTM_to_df.values.tolist()
  434. MTM_tolist[:5]
  435. # In[394]:
  436. # Apply prefixspan
  437. PrefixSpan_MTM = PrefixSpan(MTM_tolist)
  438. ###### Interchangeable ######
  439. # Get any over frequency 1
  440. prefix_MTM=PrefixSpan_MTM.frequent(1)
  441. prefix_MTM[:3]
  442. # In[395]:
  443. # Put the result to DataFrame
  444. prefix_MTM_df=pd.DataFrame(prefix_MTM)
  445. prefix_MTM_df[:5]
  446. # In[396]:
  447. # Change the columns name
  448. prefix_MTM_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  449. # Make the new column for filling the Effect
  450. prefix_MTM_df['Effect']=np.nan
  451. # Change the order of columns
  452. prefix_MTM_df=prefix_MTM_df[['Cause','Effect','Frequency']]
  453. prefix_MTM_df[:2]
  454. # In[397]:
  455. # Define the function that find the rule name
  456. def generate_cause_MTM(cell):
  457. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  458. for drule in drules:
  459. if ' '+drule in cell[0]:
  460. return drule
  461. return ''
  462. # Mapping the rule name with cause that is the effect
  463. effect_MTM=list(map(generate_cause, prefix_MTM_df.Cause))
  464. # Assign the rule name as an effect
  465. prefix_MTM_df['Effect']=effect_MTM
  466. prefix_MTM_df.sort_values(by=['Frequency'],ascending=False)
  467. # In[399]:
  468. # Attack Filter
  469. def Attack_filter_MTM(ps):
  470. return ' Attack' in ps[0]
  471. att_filter_MTM=prefix_MTM_df[list(map(Attack_filter_MTM, prefix_MTM_df.Cause))].fillna('Attack')
  472. # Malwr Filter
  473. def Malwr_filter_MTM(ps):
  474. return ' Malwr' in ps[0]
  475. mal_filter_MTM=prefix_MTM_df[list(map(Malwr_filter_MTM, prefix_MTM_df.Cause))].fillna('Malwr')
  476. # DDOS Filter
  477. def DDOS_filter_MTM(ps):
  478. return ' DDOS' in ps[0]
  479. dd_filter_MTM=prefix_MTM_df[list(map(DDOS_filter_MTM, prefix_MTM_df.Cause))].fillna('DDOS')
  480. # HACK Filter
  481. def HACK_filter_MTM(ps):
  482. return ' HACK' in ps[0]
  483. hack_filter_MTM=prefix_MTM_df[list(map(HACK_filter_MTM, prefix_MTM_df.Cause))].fillna('HACK')
  484. # MAIL Filter
  485. def MAIL_filter_MTM(ps):
  486. return ' MAIL' in ps[0]
  487. mail_filter_MTM=prefix_MTM_df[list(map(MAIL_filter_MTM, prefix_MTM_df.Cause))].fillna('MAIL')
  488. # WEB Filter
  489. def WEB_filter_MTM(ps):
  490. return ' WEB' in ps[0]
  491. prefix_MTM_df[:5]
  492. web_filter_MTM=prefix_MTM_df[list(map(WEB_filter_MTM, prefix_MTM_df.Cause))].fillna('WEB')
  493. frames_MTM = [att_filter_MTM, mal_filter_MTM, dd_filter_MTM, hack_filter_MTM, mail_filter_MTM, web_filter_MTM]
  494. result_MTM = pd.concat(frames_MTM)
  495. result_MTM.sort_values(by=['Frequency'],ascending=False)
  496. # In[ ]: