파이썬 기반의 Prefix span 분석_fork
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

PrefixSpan_20210925_edit.py 22KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731
  1. #!/usr/bin/env python
  2. # coding: utf-8
# NTM: harmful-traffic detection equipment
# MTM: malicious-file detection equipment
  5. # In[1]:
  6. #!/usr/bin/env python
  7. # coding: utf-8
  8. import pandas as pd
  9. import numpy as np
  10. from mlxtend.preprocessing import TransactionEncoder
  11. from mlxtend.frequent_patterns import association_rules, fpgrowth
  12. from prefixspan import PrefixSpan
  13. # load ts_data_accident-2020_sample.csv
  14. # to prevent dtypewarning, set low_memory=False
  15. df = pd.read_csv('ts_data_accident-2020_sample.csv', low_memory=False)
  16. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
  17. len(df) #len(df) : 10000, load successful
  18. ##################### NTM section #####################
  19. NTM_df=df[df['ACCD_FIND_MTD_CODE']==1] #* edit'1' to 1
  20. len(NTM_df)
  21. #* NTM_df.head()
  22. # Pick out it in order to get the asset, risk, intent, black IP out
  23. RISK_V2=NTM_df['RISK_V2']
  24. RISK_V2_FILTERED=RISK_V2.dropna()
  25. print(RISK_V2.size)
  26. print(RISK_V2_FILTERED.size)
  27. #* 추가 : 기존 filter_assets_value 사용시 값을 인식하지 못하는 문제 발생 -> RISK_V2를 별도의 df로 수정
  28. import json
  29. from pandas import json_normalize
  30. risk_df = pd.DataFrame()
  31. for newVal in RISK_V2_FILTERED:
  32. newVal = newVal.replace("'", "\"")
  33. newVal_str = json.loads(newVal)
  34. newVal_df = json_normalize(newVal_str)
  35. risk_df = pd.concat([risk_df,newVal_df],ignore_index=True)
  36. risk_df_col = risk_df.columns.values.tolist()
  37. # In[352]:
  38. asset_val = []
  39. intent_val=[]
  40. source_val=[]
  41. def filter_assets_value(risk):
  42. for i in range(len(risk)):
  43. risks=[]
  44. intents=[]
  45. sources=[]
  46. try:
  47. for key in risk_df_col:
  48. if 'ASSETS_VAL_' in key and risk.iloc[i][key]:
  49. risk_key_desc = 'RISK_V2.' + key + '=' + get_asset_desc(key)
  50. risks.append(risk_key_desc)
  51. if 'INTENT_VAL_' in key and risk.iloc[i][key]:
  52. intent_key_desc = 'RISK_V2.' + key + '=' + get_intent_desc(key)
  53. intents.append(intent_key_desc)
  54. if 'SOURCE_VAL_' in key and risk.iloc[i][key]:
  55. source_key_desc='RISK_V2.' + key + '=' + get_source_desc(key)
  56. sources.append(source_key_desc)
  57. except:
  58. print(risk)
  59. print(type(risk))
  60. finally:
  61. asset_val.append(risks)
  62. intent_val.append(intents)
  63. source_val.append(sources)
  64. # modified
  65. def get_asset_desc(asset_field):
  66. if asset_field == 'ASSETS_VAL_1':
  67. return '공인-전체IP대역(유선)'
  68. elif asset_field == 'ASSETS_VAL_2':
  69. return '공인-전체IP대역(무선)'
  70. elif asset_field == 'ASSETS_VAL_3':
  71. return '공인-WEB서버'
  72. elif asset_field == 'ASSETS_VAL_4':
  73. return '공인-내부응용서버'
  74. elif asset_field == 'ASSETS_VAL_5':
  75. return '공인-DB서버'
  76. elif asset_field == 'ASSETS_VAL_6':
  77. return '공인-패치서버'
  78. elif asset_field == 'ASSETS_VAL_7':
  79. return '공인-네트워크'
  80. elif asset_field == 'ASSETS_VAL_8':
  81. return '공인-보안'
  82. elif asset_field == 'ASSETS_VAL_9':
  83. return '공인-업무용PC'
  84. elif asset_field == 'ASSETS_VAL_10':
  85. return '공인-비업무용PC'
  86. elif asset_field == 'ASSETS_VAL_11':
  87. return '공인-기타'
  88. elif asset_field == 'ASSETS_VAL_12':
  89. return '사설-전체IP대역(유선)'
  90. elif asset_field == 'ASSETS_VAL_13':
  91. return '사설-전체IP대역(무선)'
  92. elif asset_field == 'ASSETS_VAL_14':
  93. return '사설-WEB서버'
  94. elif asset_field == 'ASSETS_VAL_15':
  95. return '사설-내부응용서버'
  96. elif asset_field == 'ASSETS_VAL_16':
  97. return '사설-DB서버'
  98. elif asset_field == 'ASSETS_VAL_17':
  99. return '사설-패치서버'
  100. elif asset_field == 'ASSETS_VAL_18':
  101. return '사설-네트워크'
  102. elif asset_field == 'ASSETS_VAL_19':
  103. return '사설-보안'
  104. elif asset_field == 'ASSETS_VAL_20':
  105. return '사설-업무용PC'
  106. elif asset_field == 'ASSETS_VAL_21':
  107. return '사설-비업무용PC'
  108. elif asset_field == 'ASSETS_VAL_22':
  109. return '사설-기타'
  110. else:
  111. return ''
  112. # modified
  113. def filter_intent(intent):
  114. intents=[]
  115. for intent_key in intent:
  116. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  117. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  118. intents.append(intent_key_desc)
  119. return intents
  120. # In[356]:
  121. def get_intent_desc(intent_field):
  122. if intent_field == 'INTENT_VAL_1':
  123. return '파괴'
  124. elif intent_field == 'INTENT_VAL_2':
  125. return '유출'
  126. elif intent_field == 'INTENT_VAL_3':
  127. return '지연'
  128. elif intent_field == 'INTENT_VAL_4':
  129. return '잠복'
  130. elif intent_field == 'INTENT_VAL_5':
  131. return '단순침입'
  132. elif intent_field == 'INTENT_VAL_6':
  133. return 'MD5'
  134. elif intent_field == 'INTENT_VAL_0':
  135. return 'Default'
  136. else:
  137. return ''
  138. # In[358]:
  139. # modified
  140. def filter_source(source):
  141. sources=[]
  142. for source_key in source:
  143. if 'SOURCE_VAL_' in source_key and source[source_key]:
  144. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  145. sources.append(source_key_desc)
  146. return sources
  147. # In[359]:
  148. def get_source_desc(source_field):
  149. if source_field=='SOURCE_VAL_1':
  150. return '북한IP'
  151. if source_field=='SOURCE_VAL_3':
  152. return 'ECSC Black IP'
  153. else:
  154. return ''
  155. # In[2]:
  156. filter_assets_value(risk_df)
  157. #뒤에 isna()를 통해 na값을 0으로 바꿔주는 작업을 하므로, 값이 비어있는 경우 [] 대신 비워두기
  158. # New assets column
  159. NTM_df['ASSETS_VAL']= asset_val
  160. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].astype(str)
  161. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace('[','', regex=False)
  162. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace(']','', regex=False)
  163. NTM_df[:1]
  164. # New column of intent value
  165. NTM_df['INTENT_VAL']=intent_val
  166. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].astype(str)
  167. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace('[','',regex=False)
  168. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace(']','',regex=False)
  169. NTM_df[:1]
  170. # New column of SOURCE_VAL value
  171. NTM_df['SOURCE_VAL']=source_val
  172. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].astype(str)
  173. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace('[','',regex=False)
  174. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace(']','',regex=False)
  175. NTM_df[:5]
  176. # In[361]:
  177. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  178. NTM_df.columns
  179. # In[3]:
  180. #data frame의 i번째 row를 list로 저장하여 itertools.combinations로 모든 조합 만들 예정
  181. #TW_ATT_IP와 TW_DMG_IP의 값이 같은 경우 어떤 값과의 관계인지 알 수 없으므로 데이터 가공
  182. NTM_df['TW_ATT_IP']="TW_ATT_IP="+NTM_df['TW_ATT_IP'].astype(str)
  183. NTM_df['TW_ATT_PORT']="TW_ATT_PORT="+NTM_df['TW_ATT_PORT'].astype(str)
  184. NTM_df['TW_DMG_IP']="TW_DMG_IP="+NTM_df['TW_DMG_IP'].astype(str)
  185. NTM_df['TW_DMG_PORT']="TW_DMG_PORT="+NTM_df['TW_DMG_PORT'].astype(str)
  186. # In[4]:
  187. ##################### 여기서부터 진행하시면 됩니다. #####################
  188. ##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################
  189. # It should be 13 columns in total
  190. # 1. 기관 INST_NM
  191. # 2. 공격 DRULE_ATT_TYPE_CODE1
  192. # 3. 자산 ASSETS_VAL
  193. # 4. 위협공격ip TW_ATT_IP
  194. # 5. 위협공격port TW_ATT_PORT
  195. # 6. 위협피해ip TW_DMG_IP
  196. # 7. 위협피해port TW_DMG_PORT
  197. # 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM
  198. # 9. 공격국가 TW_ATT_CT_NM
  199. # 10. 의도(7개) INTENT_VAL
  200. # 11. IP/URL 가중치 SOURCE_VAL
  201. # 12. 장비 ACCD_FIND_MTD_CODE
  202. # 13. 탐지규칙명 DRULE_NM
  203. # In[363]:
  204. NTM_df.isna().sum()
  205. # Change the Nan to zero
  206. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  207. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  208. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  209. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  210. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  211. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  212. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  213. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  214. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  215. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  216. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  217. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  218. # Check NaN out again
  219. NTM_df.isna().sum()
  220. # In[5]:
  221. # NTM_df의 col을 list로 저장. itertools.combinations로 가능한 시나리오 모두 추출
  222. # ACCD_FIND_MTD_CODE col 지우기
  223. NTM_df.drop(columns=['ACCD_FIND_MTD_CODE'], inplace=True)
  224. # In[6]:
  225. # 12의 아이템 중 2개의 조합으로 만들어질 수 있는 모든 시나리오의 갯수 파악
  226. import itertools
  227. item_n=[]
  228. for i in range(2,7):
  229. temp = itertools.combinations(NTM_df.columns.tolist(), i)
  230. item_n.append(list(temp))
  231. #12C4부터 495개의 데이터를 저장하는데에 소요되는 시간이 너무 커서 단순 반복문 사용은 적합하지 않음.
  232. for i in range(len(item_n)):
  233. print("12C" + str(i+2)+" = "+str(len(item_n[i])))
  234. # In[7]:
  235. from prefixspan import PrefixSpan
  236. # arr를 매개변수로 받아 n개의 아이템의 조합 반환
  237. def get_combination(arr, n):
  238. combination_n = list(itertools.combinations(arr.columns.tolist(),n))
  239. com_list=[]
  240. # row i 의 (1,2),(1,3)... 이런식으로 하니까 시간 너무 오래걸림
  241. # (1,2) 조합에 대한 row i, row i+1, row i+2... 이렇게 바꿈
  242. for m in range(len(combination_n[n-2])):
  243. for i in range(len(arr)):
  244. tmp_list=[]
  245. temp_df = arr.iloc[i]
  246. for col in combination_n[m]:
  247. tmp_list.append(temp_df[col])
  248. com_list.append(tmp_list)
  249. return com_list
  250. def get_prefixspan(n, load_list, save_list, save_df):
  251. save_list = PrefixSpan(load_list)
  252. #n개 아이템 조합으로 이루어졌는데 n보다 작은 갯수의 아이템으로 이루어진 prefixspan 결과 값 나옴
  253. # 방지하기 위해 prefixspan의 결과값에는 'n개의 아이템의 값'이 다 들어가도록 filter 설정
  254. save_list = save_list.frequent(1,filter = lambda patt, matches:len(patt)>n)
  255. save_df = pd.DataFrame(save_list)
  256. save_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  257. save_df = save_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)
  258. save_df = get_effect(save_df)
  259. return save_df
  260. def get_effect(edit_df):
  261. #Make the new column for filling the Effect
  262. edit_df['Effect']=np.nan
  263. #Change the order of columns
  264. edit_df=edit_df[['Cause','Effect','Frequency']]
  265. for i in range(len(edit_df)):
  266. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  267. loc_value = edit_df.loc[i]
  268. for item in loc_value['Cause']:
  269. for drule in drules:
  270. if item == drule:
  271. edit_df.loc[i,'Effect'] = item
  272. return edit_df
  273. # In[8]:
  274. # 1. 두 아이템의 조합
  275. item_of_two = get_combination(NTM_df,2)
  276. prefix_two=[]
  277. prefix_two_df = pd.DataFrame()
  278. prefix_of_two = get_prefixspan(1, item_of_two, prefix_two, prefix_two_df)
  279. prefix_of_two.to_csv('prefix_of_two.csv',sep=',')
  280. # In[9]:
  281. # 2. 세 아이템의 조합
  282. item_of_three = get_combination(NTM_df, 3)
  283. prefix_three_tmp=[]
  284. prefix_three_df = pd.DataFrame()
  285. prefix_of_three = get_prefixspan(2, item_of_three, prefix_three_tmp, prefix_three_df)
  286. prefix_of_three
  287. # In[ ]:
  288. # In[10]:
  289. # 3. 네 아이템의 조합
  290. item_of_four = get_combination(NTM_df, 4)
  291. prefix_four_tmp=[]
  292. prefix_four_df = pd.DataFrame()
  293. prefix_of_four = get_prefixspan(3, item_of_four, prefix_four_tmp, prefix_four_df)
  294. # In[11]:
  295. # 4. 다섯 아이템의 조합
  296. item_of_five = get_combination(NTM_df, 5)
  297. prefix_five_tmp=[]
  298. prefix_five_df = pd.DataFrame()
  299. prefix_of_five = get_prefixspan(4, item_of_five, prefix_five_tmp, prefix_five_df)
  300. prefix_of_five
  301. # In[12]:
  302. # 5. 여섯 아이템의 조합
  303. item_of_six = get_combination(NTM_df, 6)
  304. prefix_six_tmp=[]
  305. prefix_six_df = pd.DataFrame()
  306. prefix_of_six = get_prefixspan(5, item_of_six, prefix_six_tmp, prefix_six_df)
  307. prefix_of_six
  308. ##################### NTM section End #####################
  309. # In[13]:
  310. ##################### MTM section #####################
  311. # Same goes for the MTM section
  312. # In[375]:
  313. MTM_df=df[df['ACCD_FIND_MTD_CODE']==2]
  314. len(MTM_df)
  315. # In[376]:
  316. # Pick out it in order to get the asset, risk, intent, black IP out
  317. RISK_V2_MTM=MTM_df['RISK_V2']
  318. RISK_V2_FILTERED_MTM=RISK_V2_MTM.dropna()
  319. print(RISK_V2_MTM.size)
  320. print(RISK_V2_FILTERED_MTM.size)
  321. risk_df_MTM = pd.DataFrame()
  322. for newVal_MTM in RISK_V2_FILTERED_MTM:
  323. newVal_MTM = newVal_MTM.replace("'", "\"")
  324. newVal_MTM_str = json.loads(newVal_MTM)
  325. newVal_df_MTM = json_normalize(newVal_MTM_str)
  326. risk_df_MTM = pd.concat([risk_df_MTM,newVal_df_MTM],ignore_index=True)
  327. risk_df_col_MTM = risk_df_MTM.columns.values.tolist()
  328. # In[377]:
  329. asset_val_MTM = []
  330. intent_val_MTM=[]
  331. source_val_MTM=[]
  332. def filter_assets_value_MTM(risk):
  333. for i in range(len(risk)):
  334. risks=[]
  335. intents=[]
  336. sources=[]
  337. try:
  338. for key in risk_df_col:
  339. if 'ASSETS_VAL_' in key and risk.iloc[i][key]:
  340. risk_key_desc = 'RISK_V2.' + key + '=' + get_asset_desc(key)
  341. risks.append(risk_key_desc)
  342. if 'INTENT_VAL_' in key and risk.iloc[i][key]:
  343. intent_key_desc = 'RISK_V2.' + key + '=' + get_intent_desc(key)
  344. intents.append(intent_key_desc)
  345. if 'SOURCE_VAL_' in key and risk.iloc[i][key]:
  346. source_key_desc='RISK_V2.' + key + '=' + get_source_desc(key)
  347. sources.append(source_key_desc)
  348. except:
  349. print(risk)
  350. print(type(risk))
  351. finally:
  352. asset_val_MTM.append(risks)
  353. intent_val_MTM.append(intents)
  354. source_val_MTM.append(sources)
  355. # In[378]:
  356. # modified
  357. def get_asset_desc_MTM(asset_field):
  358. if asset_field == 'ASSETS_VAL_1':
  359. return '공인-전체IP대역(유선)'
  360. elif asset_field == 'ASSETS_VAL_2':
  361. return '공인-전체IP대역(무선)'
  362. elif asset_field == 'ASSETS_VAL_3':
  363. return '공인-WEB서버'
  364. elif asset_field == 'ASSETS_VAL_4':
  365. return '공인-내부응용서버'
  366. elif asset_field == 'ASSETS_VAL_5':
  367. return '공인-DB서버'
  368. elif asset_field == 'ASSETS_VAL_6':
  369. return '공인-패치서버'
  370. elif asset_field == 'ASSETS_VAL_7':
  371. return '공인-네트워크'
  372. elif asset_field == 'ASSETS_VAL_8':
  373. return '공인-보안'
  374. elif asset_field == 'ASSETS_VAL_9':
  375. return '공인-업무용PC'
  376. elif asset_field == 'ASSETS_VAL_10':
  377. return '공인-비업무용PC'
  378. elif asset_field == 'ASSETS_VAL_11':
  379. return '공인-기타'
  380. elif asset_field == 'ASSETS_VAL_12':
  381. return '사설-전체IP대역(유선)'
  382. elif asset_field == 'ASSETS_VAL_13':
  383. return '사설-전체IP대역(무선)'
  384. elif asset_field == 'ASSETS_VAL_14':
  385. return '사설-WEB서버'
  386. elif asset_field == 'ASSETS_VAL_15':
  387. return '사설-내부응용서버'
  388. elif asset_field == 'ASSETS_VAL_16':
  389. return '사설-DB서버'
  390. elif asset_field == 'ASSETS_VAL_17':
  391. return '사설-패치서버'
  392. elif asset_field == 'ASSETS_VAL_18':
  393. return '사설-네트워크'
  394. elif asset_field == 'ASSETS_VAL_19':
  395. return '사설-보안'
  396. elif asset_field == 'ASSETS_VAL_20':
  397. return '사설-업무용PC'
  398. elif asset_field == 'ASSETS_VAL_21':
  399. return '사설-비업무용PC'
  400. elif asset_field == 'ASSETS_VAL_22':
  401. return '사설-기타'
  402. else:
  403. return ''
  404. # In[381]:
  405. # modified
  406. def filter_intent_MTM(intent):
  407. intents=[]
  408. for intent_key in intent:
  409. if 'INTENT_VAL_' in intent_key and intent[intent_key]:
  410. intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)
  411. intents.append(intent_key_desc)
  412. return intents
  413. # In[382]:
  414. def get_intent_desc_MTM(intent_field):
  415. if intent_field == 'INTENT_VAL_1':
  416. return '파괴'
  417. elif intent_field == 'INTENT_VAL_2':
  418. return '유출'
  419. elif intent_field == 'INTENT_VAL_3':
  420. return '지연'
  421. elif intent_field == 'INTENT_VAL_4':
  422. return '잠복'
  423. elif intent_field == 'INTENT_VAL_5':
  424. return '단순침입'
  425. elif intent_field == 'INTENT_VAL_6':
  426. return 'MD5'
  427. elif intent_field == 'INTENT_VAL_0':
  428. return 'Default'
  429. else:
  430. return ''
  431. # In[384]:
  432. # modified
  433. def filter_source_MTM(source):
  434. sources=[]
  435. for source_key in source:
  436. if 'SOURCE_VAL_' in source_key and source[source_key]:
  437. source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)
  438. sources.append(source_key_desc)
  439. return sources
  440. # In[385]:
  441. def get_source_desc_MTM(source_field):
  442. if source_field=='SOURCE_VAL_1':
  443. return '북한IP'
  444. if source_field=='SOURCE_VAL_3':
  445. return 'ECSC Black IP'
  446. else:
  447. return ''
  448. # In[386]:
  449. filter_assets_value(risk_df_MTM)
  450. #뒤에 isna()를 통해 na값을 0으로 바꿔주는 작업을 하므로, 값이 비어있는 경우 [] 대신 비워두기
  451. # New assets column
  452. MTM_df['ASSETS_VAL']= asset_val_MTM
  453. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].astype(str)
  454. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].str.replace('[','', regex=False)
  455. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].str.replace(']','', regex=False)
  456. MTM_df[:1]
  457. # New column of intent value
  458. MTM_df['INTENT_VAL']=intent_val_MTM
  459. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].astype(str)
  460. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].str.replace('[','',regex=False)
  461. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].str.replace(']','',regex=False)
  462. MTM_df[:1]
  463. # New column of SOURCE_VAL value
  464. MTM_df['SOURCE_VAL']=source_val_NTN
  465. MTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].astype(str)
  466. MTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace('[','',regex=False)
  467. MTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace(']','',regex=False)
  468. MTM_df[:5]
  469. # In[361]:
  470. MTM_df.drop(columns=['RISK_V2'], inplace=True)
  471. MTM_df.columns
  472. # In[388]:
  473. MTM_df.isna().sum()
  474. # In[389]:
  475. # Change the Nan to zero
  476. MTM_df['ACCD_DMG_PROTO_NM']=MTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  477. MTM_df['INST_NM']=MTM_df['INST_NM'].replace(np.nan,'')
  478. MTM_df['DRULE_ATT_TYPE_CODE1']=MTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  479. MTM_df['TW_ATT_IP']=MTM_df['TW_ATT_IP'].replace(np.nan,0)
  480. MTM_df['TW_ATT_PORT']=MTM_df['TW_ATT_PORT'].replace(np.nan,0)
  481. MTM_df['TW_DMG_IP']=MTM_df['TW_DMG_IP'].replace(np.nan,0)
  482. MTM_df['TW_DMG_PORT']=MTM_df['TW_DMG_PORT'].replace(np.nan,0)
  483. MTM_df['TW_ATT_CT_NM']=MTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  484. MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].replace(np.nan,0)
  485. MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].replace(np.nan,0)
  486. MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].replace(np.nan,0)
  487. MTM_df['DRULE_NM']=MTM_df['DRULE_NM'].replace(np.nan,'')
  488. # In[390]:
  489. # Check NaN out again
  490. MTM_df.isna().sum()
  491. # In[391]:
  492. # ACCD_FIND_MTD_CODE col 지우기
  493. MTM_df.drop(columns=['ACCD_FIND_MTD_CODE'], inplace=True)
  494. # arr를 매개변수로 받아 n개의 아이템의 조합 반환
  495. def get_combination_MTM(arr, n):
  496. combination_n = list(itertools.combinations(arr.columns.tolist(),n))
  497. com_list=[]
  498. # row i 의 (1,2),(1,3)... 이런식으로 하니까 시간 너무 오래걸림
  499. # (1,2) 조합에 대한 row i, row i+1, row i+2... 이렇게 바꿈
  500. for m in range(len(combination_n[n-2])):
  501. for i in range(len(arr)):
  502. tmp_list=[]
  503. temp_df = arr.iloc[i]
  504. for col in combination_n[m]:
  505. tmp_list.append(temp_df[col])
  506. com_list.append(tmp_list)
  507. return com_list
  508. def get_prefixspan_MTM(n, load_list, save_list, save_df):
  509. save_list = PrefixSpan(load_list)
  510. #n개 아이템 조합으로 이루어졌는데 n보다 작은 갯수의 아이템으로 이루어진 prefixspan 결과 값 나옴
  511. # 방지하기 위해 prefixspan의 결과값에는 'n개의 아이템의 값'이 다 들어가도록 filter 설정
  512. save_list = save_list.frequent(1,filter = lambda patt, matches:len(patt)>n)
  513. save_df = pd.DataFrame(save_list)
  514. save_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)
  515. save_df = save_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)
  516. save_df = get_effect(save_df)
  517. return save_df
  518. def get_effect_MTM(edit_df):
  519. #Make the new column for filling the Effect
  520. edit_df['Effect']=np.nan
  521. #Change the order of columns
  522. edit_df=edit_df[['Cause','Effect','Frequency']]
  523. for i in range(len(edit_df)):
  524. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  525. loc_value = edit_df.loc[i]
  526. for item in loc_value['Cause']:
  527. for drule in drules:
  528. if item == drule:
  529. edit_df.loc[i,'Effect'] = item
  530. return edit_df
  531. # 1. 두 아이템의 조합
  532. item_of_two_MTM = get_combination(MTM_df,2)
  533. prefix_two_MTM=[]
  534. prefix_two_df_MTM = pd.DataFrame()
  535. prefix_of_two_MTM = get_prefixspan(1, item_of_two_MTM, prefix_two_MTM, prefix_two_d_MTMf)
  536. prefix_of_two_MTM
  537. # 2. 세 아이템의 조합
  538. item_of_three_MTM = get_combination(MTM_df, 3)
  539. prefix_three_tmp_MTM=[]
  540. prefix_three_df_MTM = pd.DataFrame()
  541. prefix_of_three_MTM = get_prefixspan(2, item_of_three_MTM, prefix_three_tmp_MTM, prefix_three_df_MTM)
  542. prefix_of_three_MTM
  543. # 3. 네 아이템의 조합
  544. item_of_four_MTM = get_combination(MTM_df, 4)
  545. prefix_four_tmp_MTM=[]
  546. prefix_four_df_MTM = pd.DataFrame()
  547. prefix_of_four_MTM = get_prefixspan(3, item_of_four_MTM, prefix_four_tmp_MTM, prefix_four_df_MTM)
  548. # 4. 다섯 아이템의 조합
  549. item_of_five_MTM = get_combination(MTM_df, 5)
  550. prefix_five_tmp_MTM=[]
  551. prefix_five_df_MTM = pd.DataFrame()
  552. prefix_of_five_MTM = get_prefixspan(4, item_of_five_MTM, prefix_five_tmp_MTM, prefix_five_df_MTM)
  553. prefix_of_five_MTM
  554. # 5. 여섯 아이템의 조합
  555. item_of_six_MTM = get_combination(MTM_df, 6)
  556. prefix_six_tmp_MTM=[]
  557. prefix_six_df_MTM = pd.DataFrame()
  558. prefix_of_six_MTM = get_prefixspan(5, item_of_six_MTM, prefix_six_tmp_MTM, prefix_six_df_MTM)
  559. prefix_of_six_MTM
  560. ##################### MTM section End #####################
  561. # In[ ]:
  562. # In[ ]: