파이썬 기반의 Prefix span 분석
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

PrefixSpan_20210925.py 17KB

# (Line-number gutter from the repository web view, 1-760, collapsed here; it is not part of the script.)
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. import pandas as pd
  5. import numpy as np
  6. from mlxtend.preprocessing import TransactionEncoder
  7. from mlxtend.frequent_patterns import association_rules, fpgrowth
  8. from prefixspan import PrefixSpan
  # Keep only the columns used by the analysis and drop rows with any missing value.
  # NOTE(review): `df` is not created anywhere in the visible code - presumably a
  # loading cell precedes this line; confirm before running as a script.
  df = df[['RISK_V2', 'INST_NM', 'DRULE_ATT_TYPE_CODE1', 'TW_ATT_IP', 'TW_ATT_PORT',
           'TW_DMG_IP', 'TW_DMG_PORT', 'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM',
           'ACCD_FIND_MTD_CODE', 'DRULE_NM']].dropna()
  len(df)
  df.head()
  # In[349]:
  ##################### NTM section #####################
  # In[350]:
  # Rows whose ACCD_FIND_MTD_CODE equals '1'. The explicit copy makes the column
  # assignments further down operate on an independent frame rather than on a
  # slice of `df` (avoids pandas chained-assignment warnings).
  NTM_df = df[df['ACCD_FIND_MTD_CODE'] == '1'].copy()
  len(NTM_df)
  # In[351]:
  # Pull RISK_V2 out so the asset, intent and source flags can be extracted from it
  RISK_V2 = NTM_df['RISK_V2']
  RISK_V2_FILTERED = RISK_V2.dropna()
  print(RISK_V2.size)
  print(RISK_V2_FILTERED.size)
  23. # In[352]:
  def filter_assets_value(risk):
      """Collect labelled descriptions of the asset flags set in one RISK_V2 record.

      Args:
          risk: Mapping-like record. Every key containing ``'ASSETS_VAL_'``
              whose value is truthy is reported.

      Returns:
          A list of ``'RISK_V2.<key>=<description>'`` strings. When the record
          cannot be walked (for example ``None`` or a float), the record and
          its type are printed and whatever was collected so far is returned.
      """
      risks = []
      try:
          for risk_key in risk:
              if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
                  risks.append('RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key))
      except (TypeError, KeyError, IndexError):
          # Best-effort: a malformed record is reported and skipped. Only the
          # errors a bad record shape can raise are caught, so interrupts and
          # genuine programming errors still propagate.
          print(risk)
          print(type(risk))
      return risks
  36. # In[353]:
  37. # modified
  # Label for each ASSETS_VAL_<n> flag (1-11 carry the "공인" prefix, 12-22 the "사설" prefix).
  _ASSET_DESCRIPTIONS = {
      'ASSETS_VAL_1': '공인-전체IP대역(유선)',
      'ASSETS_VAL_2': '공인-전체IP대역(무선)',
      'ASSETS_VAL_3': '공인-WEB서버',
      'ASSETS_VAL_4': '공인-내부응용서버',
      'ASSETS_VAL_5': '공인-DB서버',
      'ASSETS_VAL_6': '공인-패치서버',
      'ASSETS_VAL_7': '공인-네트워크',
      'ASSETS_VAL_8': '공인-보안',
      'ASSETS_VAL_9': '공인-업무용PC',
      'ASSETS_VAL_10': '공인-비업무용PC',
      'ASSETS_VAL_11': '공인-기타',
      'ASSETS_VAL_12': '사설-전체IP대역(유선)',
      'ASSETS_VAL_13': '사설-전체IP대역(무선)',
      'ASSETS_VAL_14': '사설-WEB서버',
      'ASSETS_VAL_15': '사설-내부응용서버',
      'ASSETS_VAL_16': '사설-DB서버',
      'ASSETS_VAL_17': '사설-패치서버',
      'ASSETS_VAL_18': '사설-네트워크',
      'ASSETS_VAL_19': '사설-보안',
      'ASSETS_VAL_20': '사설-업무용PC',
      'ASSETS_VAL_21': '사설-비업무용PC',
      'ASSETS_VAL_22': '사설-기타',
  }


  def get_asset_desc(asset_field):
      """Return the label for an ``ASSETS_VAL_<n>`` field name.

      Args:
          asset_field: Field name such as ``'ASSETS_VAL_3'``.

      Returns:
          The matching label, or ``''`` when the field name is not one of
          ``ASSETS_VAL_1`` .. ``ASSETS_VAL_22``.
      """
      return _ASSET_DESCRIPTIONS.get(asset_field, '')
  85. # In[354]:
  # New assets column: one stringified list of asset labels per row.
  # Series.map keeps the row index, so values are aligned by label instead of
  # relying on RISK_V2_FILTERED having exactly as many rows as NTM_df.
  NTM_df['ASSETS_VAL'] = RISK_V2_FILTERED.map(filter_assets_value).astype(str)
  NTM_df[:1]
  90. # In[355]:
  91. # modified
  def filter_intent(intent):
      """Collect labelled descriptions of the intent flags set in one RISK_V2 record.

      Args:
          intent: Mapping-like record. Every key containing ``'INTENT_VAL_'``
              whose value is truthy is reported.

      Returns:
          A list of ``'RISK_V2.<key>=<description>'`` strings, in the record's
          iteration order.
      """
      return [
          'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
          for intent_key in intent
          if 'INTENT_VAL_' in intent_key and intent[intent_key]
      ]
  99. # In[356]:
  # Label for each INTENT_VAL_<n> flag.
  _INTENT_DESCRIPTIONS = {
      'INTENT_VAL_1': '파괴',
      'INTENT_VAL_2': '유출',
      'INTENT_VAL_3': '지연',
      'INTENT_VAL_4': '잠복',
      'INTENT_VAL_5': '단순침입',
      'INTENT_VAL_6': 'MD5',
      'INTENT_VAL_0': 'Default',
  }


  def get_intent_desc(intent_field):
      """Return the label for an ``INTENT_VAL_<n>`` field name.

      Args:
          intent_field: Field name such as ``'INTENT_VAL_2'``.

      Returns:
          The matching label, or ``''`` for an unknown field name.
      """
      return _INTENT_DESCRIPTIONS.get(intent_field, '')
  117. # In[357]:
  # New column of intent value: one stringified list of intent labels per row.
  # Series.map keeps the row index, so values are aligned by label.
  NTM_df['INTENT_VAL'] = RISK_V2_FILTERED.map(filter_intent).astype(str)
  NTM_df[:1]
  122. # In[358]:
  123. # modified
  def filter_source(source):
      """Collect labelled descriptions of the source flags set in one RISK_V2 record.

      Args:
          source: Mapping-like record. Every key containing ``'SOURCE_VAL_'``
              whose value is truthy is reported.

      Returns:
          A list of ``'RISK_V2.<key>=<description>'`` strings, in the record's
          iteration order.
      """
      return [
          'RISK_V2.' + source_key + '=' + get_source_desc(source_key)
          for source_key in source
          if 'SOURCE_VAL_' in source_key and source[source_key]
      ]
  131. # In[359]:
  # Label for each SOURCE_VAL_<n> flag handled by the script.
  _SOURCE_DESCRIPTIONS = {
      'SOURCE_VAL_1': '북한IP',
      'SOURCE_VAL_3': 'ECSC Black IP',
  }


  def get_source_desc(source_field):
      """Return the label for a ``SOURCE_VAL_<n>`` field name.

      Args:
          source_field: Field name such as ``'SOURCE_VAL_1'``.

      Returns:
          The matching label, or ``''`` for any other field name.
      """
      return _SOURCE_DESCRIPTIONS.get(source_field, '')
  139. # In[360]:
  # New column of SOURCE_VAL value: one stringified list of source labels per row.
  # Series.map keeps the row index, so values are aligned by label.
  NTM_df['SOURCE_VAL'] = RISK_V2_FILTERED.map(filter_source).astype(str)
  NTM_df[:5]
  # In[361]:
  # RISK_V2 has been expanded into the three columns above; rebinding instead of
  # inplace=True avoids mutating a frame that may still be a slice of `df`.
  NTM_df = NTM_df.drop(columns=['RISK_V2'])
  NTM_df.columns
  147. # In[362]:
  ##################### Continue from here. #####################
  ##################### Apply the algorithm to every combination of the 12 items below (item 12, device ACCD_FIND_MTD_CODE, is excluded) #####################
  # It should be 13 columns in total
  # 1. Institution INST_NM
  # 2. Attack DRULE_ATT_TYPE_CODE1
  # 3. Asset ASSETS_VAL
  # 4. Threat attacker IP TW_ATT_IP
  # 5. Threat attacker port TW_ATT_PORT
  # 6. Threat victim IP TW_DMG_IP
  # 7. Threat victim port TW_DMG_PORT
  # 8. Threat victim protocol ACCD_DMG_PROTO_NM
  # 9. Attacking country TW_ATT_CT_NM
  # 10. Intent (7 kinds) INTENT_VAL
  # 11. IP/URL weight SOURCE_VAL
  # 12. Device ACCD_FIND_MTD_CODE
  # 13. Detection rule name DRULE_NM
  #
  165. # In[363]:
  NTM_df.isna().sum()
  # In[364]:
  # Replace NaN: these columns get an empty string ...
  for _column in ('ACCD_DMG_PROTO_NM', 'INST_NM', 'DRULE_ATT_TYPE_CODE1',
                  'TW_ATT_CT_NM', 'DRULE_NM'):
      NTM_df[_column] = NTM_df[_column].replace(np.nan, '')
  # ... and these get 0.
  for _column in ('TW_ATT_IP', 'TW_ATT_PORT', 'TW_DMG_IP', 'TW_DMG_PORT',
                  'ASSETS_VAL', 'INTENT_VAL', 'SOURCE_VAL'):
      NTM_df[_column] = NTM_df[_column].replace(np.nan, 0)
  # In[365]:
  # Check NaN out again
  NTM_df.isna().sum()
  184. # In[366]:
  # Merge all: build one space-separated string per row from every field.
  # The whole expression is parenthesised; without that, each continuation
  # line starting with `+` is parsed as its own statement, so only the first
  # two fields were combined and the next line raised TypeError.
  NTM_df['Combined'] = (
      NTM_df['INST_NM'].astype(str) + ' ' + NTM_df['TW_ATT_IP'].astype(str)
      + ' ' + NTM_df['TW_ATT_PORT'].astype(str) + ' ' + NTM_df['TW_DMG_IP'].astype(str) + ' '
      + NTM_df['TW_DMG_PORT'].astype(str) + ' ' + NTM_df['ACCD_DMG_PROTO_NM'].astype(str)
      + ' ' + NTM_df['TW_ATT_CT_NM'] + ' ' + NTM_df['ASSETS_VAL'] + ' ' + NTM_df['INTENT_VAL'] + ' '
      + NTM_df['SOURCE_VAL'] + ' ' + NTM_df['DRULE_ATT_TYPE_CODE1'] + ' ' + NTM_df['DRULE_NM']
  )
  NTM_com = NTM_df['Combined']
  NTM_com[:10]
  194. # In[367]:
  # Change the type to DataFrame
  NTM_to_df = pd.DataFrame(NTM_com)
  NTM_to_df[:5]
  # In[368]:
  # Change the type to a nested list for the algorithm; the frame has a single
  # column, so every inner list holds exactly one combined string.
  NTM_tolist = NTM_to_df.values.tolist()
  NTM_tolist[:5]
  # In[369]:
  from prefixspan import PrefixSpan
  # In[370]:
  # Apply prefixspan
  PrefixSpan_NTM = PrefixSpan(NTM_tolist)
  ###### Interchangeable ######
  # Get any over frequency 1
  prefix_NTM = PrefixSpan_NTM.frequent(1)
  prefix_NTM[:3]
  # In[371]:
  # Put the result to DataFrame. Naming the columns at construction keeps the
  # frame well-formed even when no pattern is returned (previously the 0/1
  # columns did not exist in that case and the reorder below raised KeyError).
  prefix_NTM_df = pd.DataFrame(prefix_NTM, columns=['Frequency', 'Cause'])
  prefix_NTM_df[:5]
  # In[372]:
  # Make the new column for filling the Effect
  prefix_NTM_df['Effect'] = np.nan
  # Change the order of columns
  prefix_NTM_df = prefix_NTM_df[['Cause', 'Effect', 'Frequency']]
  prefix_NTM_df[:2]
  223. # In[373]:
  # Define the function that find the rule name
  def generate_cause(cell):
      """Return the first rule keyword found in the first item of a pattern.

      Args:
          cell: Sequence whose first element is the combined text of a row.

      Returns:
          The first of ``'Attack'``, ``'DDOS'``, ``'HACK'``, ``'MAIL'``,
          ``'Malwr'``, ``'WEB'`` (checked in that order) that occurs in
          ``cell[0]`` preceded by a space, or ``''`` when none occurs or the
          pattern is empty.
      """
      if not cell:
          # An empty pattern has no text to inspect; avoids IndexError on cell[0].
          return ''
      text = cell[0]
      drules = ['Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB']
      return next((drule for drule in drules if ' ' + drule in text), '')
  # Derive the effect (rule keyword) of every pattern from its cause text
  effect = [generate_cause(cause) for cause in prefix_NTM_df.Cause]
  # Assign the rule name as an effect
  prefix_NTM_df['Effect'] = effect
  prefix_NTM_df.sort_values(by=['Frequency'], ascending=False)
  236. # In[374]:
  # One predicate per rule keyword; each inspects the first item of a pattern.
  # Attack Filter
  def Attack_filter(ps):
      """Return True when the first item of *ps* contains ' Attack'."""
      return ' Attack' in ps[0]


  # Malwr Filter
  def Malwr_filter(ps):
      """Return True when the first item of *ps* contains ' Malwr'."""
      return ' Malwr' in ps[0]


  # DDOS Filter
  def DDOS_filter(ps):
      """Return True when the first item of *ps* contains ' DDOS'."""
      return ' DDOS' in ps[0]


  # HACK Filter
  def HACK_filter(ps):
      """Return True when the first item of *ps* contains ' HACK'."""
      return ' HACK' in ps[0]


  # MAIL Filter
  def MAIL_filter(ps):
      """Return True when the first item of *ps* contains ' MAIL'."""
      return ' MAIL' in ps[0]


  # WEB Filter
  def WEB_filter(ps):
      """Return True when the first item of *ps* contains ' WEB'."""
      return ' WEB' in ps[0]


  # Boolean Series masks are used instead of plain lists: an empty list mask
  # would be interpreted as an (empty) column selection on an empty frame.
  att_filter = prefix_NTM_df[prefix_NTM_df.Cause.map(Attack_filter).astype(bool)].fillna('Attack')
  mal_filter = prefix_NTM_df[prefix_NTM_df.Cause.map(Malwr_filter).astype(bool)].fillna('Malwr')
  dd_filter = prefix_NTM_df[prefix_NTM_df.Cause.map(DDOS_filter).astype(bool)].fillna('DDOS')
  hack_filter = prefix_NTM_df[prefix_NTM_df.Cause.map(HACK_filter).astype(bool)].fillna('HACK')
  mail_filter = prefix_NTM_df[prefix_NTM_df.Cause.map(MAIL_filter).astype(bool)].fillna('MAIL')
  prefix_NTM_df
  web_filter = prefix_NTM_df[prefix_NTM_df.Cause.map(WEB_filter).astype(bool)].fillna('WEB')
  # Stack the per-keyword subsets; a pattern matching several keywords appears once per match.
  frames = [att_filter, mal_filter, dd_filter, hack_filter, mail_filter, web_filter]
  result = pd.concat(frames)
  result.sort_values(by=['Frequency'], ascending=False)
  265. # In[ ]:
  266. ##################### NTM section End #####################
  267. # In[ ]:
  268. ##################### MTM section #####################
  269. # In[375]:
  # Rows whose ACCD_FIND_MTD_CODE equals '2'. The explicit copy makes the column
  # assignments further down operate on an independent frame rather than on a
  # slice of `df` (avoids pandas chained-assignment warnings).
  MTM_df = df[df['ACCD_FIND_MTD_CODE'] == '2'].copy()
  len(MTM_df)
  # In[376]:
  # Pull RISK_V2 out so the asset, intent and source flags can be extracted from it
  RISK_V2_MTM = MTM_df['RISK_V2']
  RISK_V2_FILTERED_MTM = RISK_V2_MTM.dropna()
  print(RISK_V2_MTM.size)
  print(RISK_V2_FILTERED_MTM.size)
  278. # In[377]:
  def filter_assets_value_MTM(risk):
      """Collect labelled asset descriptions for one MTM RISK_V2 record.

      The MTM variant duplicated ``filter_assets_value`` line for line, so it
      now delegates to it instead of keeping a second copy in sync.

      Args:
          risk: Mapping-like record with ``ASSETS_VAL_<n>`` flags.

      Returns:
          Whatever ``filter_assets_value(risk)`` returns: a list of
          ``'RISK_V2.<key>=<description>'`` strings.
      """
      return filter_assets_value(risk)
  291. # In[378]:
  292. # modified
  def get_asset_desc_MTM(asset_field):
      """Return the label for an ``ASSETS_VAL_<n>`` field name (MTM alias).

      The MTM variant repeated the exact table of ``get_asset_desc``; it now
      delegates so the labels are defined in one place.

      Args:
          asset_field: Field name such as ``'ASSETS_VAL_3'``.

      Returns:
          The matching label, or ``''`` for an unknown field name.
      """
      return get_asset_desc(asset_field)
  340. # In[379]:
  # New assets column: one stringified list of asset labels per row.
  # Series.map keeps the row index, so values are aligned by label instead of
  # relying on RISK_V2_FILTERED_MTM having exactly as many rows as MTM_df.
  MTM_df['ASSETS_VAL'] = RISK_V2_FILTERED_MTM.map(filter_assets_value_MTM).astype(str)
  MTM_df[:1]
  345. # In[381]:
  346. # modified
  def filter_intent_MTM(intent):
      """Collect labelled intent descriptions for one MTM RISK_V2 record.

      Delegates to ``filter_intent``, which this function duplicated exactly.

      Args:
          intent: Mapping-like record with ``INTENT_VAL_<n>`` flags.

      Returns:
          Whatever ``filter_intent(intent)`` returns: a list of
          ``'RISK_V2.<key>=<description>'`` strings.
      """
      return filter_intent(intent)
  354. # In[382]:
  def get_intent_desc_MTM(intent_field):
      """Return the label for an ``INTENT_VAL_<n>`` field name (MTM alias).

      Delegates to ``get_intent_desc``, whose table this function repeated.

      Args:
          intent_field: Field name such as ``'INTENT_VAL_2'``.

      Returns:
          The matching label, or ``''`` for an unknown field name.
      """
      return get_intent_desc(intent_field)
  372. # In[383]:
  # New column of intent value: one stringified list of intent labels per row.
  # Series.map keeps the row index, so values are aligned by label.
  MTM_df['INTENT_VAL'] = RISK_V2_FILTERED_MTM.map(filter_intent_MTM).astype(str)
  MTM_df[:1]
  377. # In[384]:
  378. # modified
  def filter_source_MTM(source):
      """Collect labelled source descriptions for one MTM RISK_V2 record.

      Delegates to ``filter_source``, which this function duplicated exactly.

      Args:
          source: Mapping-like record with ``SOURCE_VAL_<n>`` flags.

      Returns:
          Whatever ``filter_source(source)`` returns: a list of
          ``'RISK_V2.<key>=<description>'`` strings.
      """
      return filter_source(source)
  386. # In[385]:
  def get_source_desc_MTM(source_field):
      """Return the label for a ``SOURCE_VAL_<n>`` field name (MTM alias).

      Delegates to ``get_source_desc``, whose branches this function repeated.

      Args:
          source_field: Field name such as ``'SOURCE_VAL_1'``.

      Returns:
          The matching label, or ``''`` for any other field name.
      """
      return get_source_desc(source_field)
  394. # In[386]:
  # New column of SOURCE_VAL value: one stringified list of source labels per row.
  # Series.map keeps the row index, so values are aligned by label.
  MTM_df['SOURCE_VAL'] = RISK_V2_FILTERED_MTM.map(filter_source_MTM).astype(str)
  MTM_df[:5]
  # In[387]:
  # RISK_V2 has been expanded into the three columns above; rebinding instead of
  # inplace=True avoids mutating a frame that may still be a slice of `df`.
  MTM_df = MTM_df.drop(columns=['RISK_V2'])
  MTM_df.columns
  402. # In[388]:
  MTM_df.isna().sum()
  # In[389]:
  # Replace NaN: these columns get an empty string ...
  for _column in ('ACCD_DMG_PROTO_NM', 'INST_NM', 'DRULE_ATT_TYPE_CODE1',
                  'TW_ATT_CT_NM', 'DRULE_NM'):
      MTM_df[_column] = MTM_df[_column].replace(np.nan, '')
  # ... and these get 0.
  for _column in ('TW_ATT_IP', 'TW_ATT_PORT', 'TW_DMG_IP', 'TW_DMG_PORT',
                  'ASSETS_VAL', 'INTENT_VAL', 'SOURCE_VAL'):
      MTM_df[_column] = MTM_df[_column].replace(np.nan, 0)
  # In[390]:
  # Check NaN out again
  MTM_df.isna().sum()
  421. # In[391]:
  # Merge all: build one space-separated string per row from every field.
  # Parenthesised so the expression can span several readable lines.
  MTM_df['Combined'] = (
      MTM_df['INST_NM'].astype(str) + ' ' + MTM_df['TW_ATT_IP'].astype(str)
      + ' ' + MTM_df['TW_ATT_PORT'].astype(str) + ' ' + MTM_df['TW_DMG_IP'].astype(str) + ' '
      + MTM_df['TW_DMG_PORT'].astype(str) + ' ' + MTM_df['ACCD_DMG_PROTO_NM'].astype(str)
      + ' ' + MTM_df['TW_ATT_CT_NM'] + ' ' + MTM_df['ASSETS_VAL'] + ' ' + MTM_df['INTENT_VAL'] + ' '
      + MTM_df['SOURCE_VAL'] + ' ' + MTM_df['DRULE_ATT_TYPE_CODE1'] + ' ' + MTM_df['DRULE_NM']
  )
  MTM_com = MTM_df['Combined']
  MTM_com[:10]
  427. # In[392]:
  # Change the type to DataFrame
  MTM_to_df = pd.DataFrame(MTM_com)
  MTM_to_df[:5]
  # In[393]:
  # Change the type to a nested list for the algorithm; the frame has a single
  # column, so every inner list holds exactly one combined string.
  MTM_tolist = MTM_to_df.values.tolist()
  MTM_tolist[:5]
  # In[394]:
  # Apply prefixspan
  PrefixSpan_MTM = PrefixSpan(MTM_tolist)
  ###### Interchangeable ######
  # Get any over frequency 1
  prefix_MTM = PrefixSpan_MTM.frequent(1)
  prefix_MTM[:3]
  # In[395]:
  # Put the result to DataFrame. Naming the columns at construction keeps the
  # frame well-formed even when no pattern is returned (previously the 0/1
  # columns did not exist in that case and the reorder below raised KeyError).
  prefix_MTM_df = pd.DataFrame(prefix_MTM, columns=['Frequency', 'Cause'])
  prefix_MTM_df[:5]
  # In[396]:
  # Make the new column for filling the Effect
  prefix_MTM_df['Effect'] = np.nan
  # Change the order of columns
  prefix_MTM_df = prefix_MTM_df[['Cause', 'Effect', 'Frequency']]
  prefix_MTM_df[:2]
  454. # In[397]:
  # Define the function that find the rule name
  def generate_cause_MTM(cell):
      """Return the first rule keyword found in the first item of an MTM pattern.

      Args:
          cell: Sequence whose first element is the combined text of a row.

      Returns:
          The first of ``'Attack'``, ``'DDOS'``, ``'HACK'``, ``'MAIL'``,
          ``'Malwr'``, ``'WEB'`` (checked in that order) that occurs in
          ``cell[0]`` preceded by a space, or ``''`` when none occurs or the
          pattern is empty.
      """
      if not cell:
          # An empty pattern has no text to inspect; avoids IndexError on cell[0].
          return ''
      text = cell[0]
      drules = ['Attack', 'DDOS', 'HACK', 'MAIL', 'Malwr', 'WEB']
      return next((drule for drule in drules if ' ' + drule in text), '')
  # Derive the effect (rule keyword) of every pattern from its cause text.
  # Uses the MTM helper defined just above rather than the NTM one.
  effect_MTM = [generate_cause_MTM(cause) for cause in prefix_MTM_df.Cause]
  # Assign the rule name as an effect
  prefix_MTM_df['Effect'] = effect_MTM
  prefix_MTM_df.sort_values(by=['Frequency'], ascending=False)
  467. # In[399]:
  # One predicate per rule keyword; each inspects the first item of a pattern.
  # Attack Filter
  def Attack_filter_MTM(ps):
      """Return True when the first item of *ps* contains ' Attack'."""
      return ' Attack' in ps[0]


  # Malwr Filter
  def Malwr_filter_MTM(ps):
      """Return True when the first item of *ps* contains ' Malwr'."""
      return ' Malwr' in ps[0]


  # DDOS Filter
  def DDOS_filter_MTM(ps):
      """Return True when the first item of *ps* contains ' DDOS'."""
      return ' DDOS' in ps[0]


  # HACK Filter
  def HACK_filter_MTM(ps):
      """Return True when the first item of *ps* contains ' HACK'."""
      return ' HACK' in ps[0]


  # MAIL Filter
  def MAIL_filter_MTM(ps):
      """Return True when the first item of *ps* contains ' MAIL'."""
      return ' MAIL' in ps[0]


  # WEB Filter
  def WEB_filter_MTM(ps):
      """Return True when the first item of *ps* contains ' WEB'."""
      return ' WEB' in ps[0]


  # Boolean Series masks are used instead of plain lists: an empty list mask
  # would be interpreted as an (empty) column selection on an empty frame.
  att_filter_MTM = prefix_MTM_df[prefix_MTM_df.Cause.map(Attack_filter_MTM).astype(bool)].fillna('Attack')
  mal_filter_MTM = prefix_MTM_df[prefix_MTM_df.Cause.map(Malwr_filter_MTM).astype(bool)].fillna('Malwr')
  dd_filter_MTM = prefix_MTM_df[prefix_MTM_df.Cause.map(DDOS_filter_MTM).astype(bool)].fillna('DDOS')
  hack_filter_MTM = prefix_MTM_df[prefix_MTM_df.Cause.map(HACK_filter_MTM).astype(bool)].fillna('HACK')
  mail_filter_MTM = prefix_MTM_df[prefix_MTM_df.Cause.map(MAIL_filter_MTM).astype(bool)].fillna('MAIL')
  prefix_MTM_df[:5]
  web_filter_MTM = prefix_MTM_df[prefix_MTM_df.Cause.map(WEB_filter_MTM).astype(bool)].fillna('WEB')
  # Stack the per-keyword subsets; a pattern matching several keywords appears once per match.
  frames_MTM = [att_filter_MTM, mal_filter_MTM, dd_filter_MTM, hack_filter_MTM, mail_filter_MTM, web_filter_MTM]
  result_MTM = pd.concat(frames_MTM)
  result_MTM.sort_values(by=['Frequency'], ascending=False)
  496. # In[ ]: