파이썬 기반의 Prefix span 분석
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

prefix_span.py 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. import pandas as pd
  5. import numpy as np
  6. from mlxtend.preprocessing import TransactionEncoder
  7. from mlxtend.frequent_patterns import association_rules, fpgrowth
  8. from prefixspan import PrefixSpan
  9. df = pd.read_csv("ts_data_accident-2020_sample.csv", low_memory=False, encoding='ISO-8859-1')
  10. pd.set_option('display.max_columns',None)
  11. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
  12. len(df)
  13. ##################### NTM section #####################
  14. NTM_df=df[df['ACCD_FIND_MTD_CODE']==1]
  15. NTM_df
  16. # Pick out it in order to get the asset, risk, intent, black IP out
  17. RISK_V2=NTM_df['RISK_V2']
  18. RISK_V2_FILTERED=RISK_V2.dropna()
  19. ## 결측값 제거.
  20. import json
  21. from pandas import json_normalize
  22. # modified
  23. def get_asset_desc(asset_field):
  24. if asset_field == 'ASSETS_VAL_1':
  25. return '공인-전체IP대역(유선)'
  26. elif asset_field == 'ASSETS_VAL_2':
  27. return '공인-전체IP대역(무선)'
  28. elif asset_field == 'ASSETS_VAL_3':
  29. return '공인-WEB서버'
  30. elif asset_field == 'ASSETS_VAL_4':
  31. return '공인-내부응용서버'
  32. elif asset_field == 'ASSETS_VAL_5':
  33. return '공인-DB서버'
  34. elif asset_field == 'ASSETS_VAL_6':
  35. return '공인-패치서버'
  36. elif asset_field == 'ASSETS_VAL_7':
  37. return '공인-네트워크'
  38. elif asset_field == 'ASSETS_VAL_8':
  39. return '공인-보안'
  40. elif asset_field == 'ASSETS_VAL_9':
  41. return '공인-업무용PC'
  42. elif asset_field == 'ASSETS_VAL_10':
  43. return '공인-비업무용PC'
  44. elif asset_field == 'ASSETS_VAL_11':
  45. return '공인-기타'
  46. elif asset_field == 'ASSETS_VAL_12':
  47. return '사설-전체IP대역(유선)'
  48. elif asset_field == 'ASSETS_VAL_13':
  49. return '사설-전체IP대역(무선)'
  50. elif asset_field == 'ASSETS_VAL_14':
  51. return '사설-WEB서버'
  52. elif asset_field == 'ASSETS_VAL_15':
  53. return '사설-내부응용서버'
  54. elif asset_field == 'ASSETS_VAL_16':
  55. return '사설-DB서버'
  56. elif asset_field == 'ASSETS_VAL_17':
  57. return '사설-패치서버'
  58. elif asset_field == 'ASSETS_VAL_18':
  59. return '사설-네트워크'
  60. elif asset_field == 'ASSETS_VAL_19':
  61. return '사설-보안'
  62. elif asset_field == 'ASSETS_VAL_20':
  63. return '사설-업무용PC'
  64. elif asset_field == 'ASSETS_VAL_21':
  65. return '사설-비업무용PC'
  66. elif asset_field == 'ASSETS_VAL_22':
  67. return '사설-기타'
  68. else:
  69. return ''
  70. def get_intent_desc(intent_field):
  71. if intent_field == 'INTENT_VAL_1':
  72. return '파괴'
  73. elif intent_field == 'INTENT_VAL_2':
  74. return '유출'
  75. elif intent_field == 'INTENT_VAL_3':
  76. return '지연'
  77. elif intent_field == 'INTENT_VAL_4':
  78. return '잠복'
  79. elif intent_field == 'INTENT_VAL_5':
  80. return '단순침입'
  81. elif intent_field == 'INTENT_VAL_6':
  82. return 'MD5'
  83. elif intent_field == 'INTENT_VAL_0':
  84. return 'Default'
  85. else:
  86. return ''
  87. def get_source_desc(source_field):
  88. if source_field=='SOURCE_VAL_1':
  89. return '북한IP'
  90. if source_field=='SOURCE_VAL_3':
  91. return 'ECSC Black IP'
  92. else:
  93. return ''
  94. # New assets column
  95. ## ASSETS_VAL을 아예 JSON항목으로 만들어서 새로운 데이터프레임으로 생성.
  96. risk_df = pd.DataFrame()
  97. for risk in RISK_V2_FILTERED:
  98. risk = risk.replace("'", "\"") #json으로 만들려고.
  99. json_string = json.loads(risk)
  100. json_df = json_normalize(json_string)
  101. risk_df = pd.concat([risk_df,json_df],ignore_index=True) #DataFrame 합쳐주기. ignore_index = True를 해야 index가 재구성 된다.
  102. risk_df_column_names = risk_df.columns
  103. assets_df = []
  104. intents_df = []
  105. sources_df = []
  106. def filter_all(risk):
  107. for i in range(0,len(risk)):
  108. risks=[]
  109. intents=[]
  110. sources=[]
  111. for column in risk_df_column_names:
  112. # filter_asset
  113. if 'ASSETS_VAL_' in column and risk.iloc[i][column]:
  114. risk_key_desc = 'RISK_V2.' + column + '=' + get_asset_desc(column)
  115. risks.append(risk_key_desc)
  116. # filter_intent
  117. if 'INTENT_VAL_' in column and risk.iloc[i][column]:
  118. intent_key_desc = 'RISK_V2.' + column + '=' + get_intent_desc(column)
  119. intents.append(intent_key_desc)
  120. if 'SOURCE_VAL_' in column and risk.iloc[i][column]:
  121. source_key_desc='RISK_V2.' + column + '=' + get_source_desc(column)
  122. sources.append(source_key_desc)
  123. assets_df.append(risks)
  124. intents_df.append(intents)
  125. sources_df.append(sources)
  126. filter_all(risk_df)
  127. ## 여기까지 내가 만든 것.
  128. ## ASSETS_VAL 확인
  129. NTM_df['ASSETS_VAL'] = assets_df
  130. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].astype(str)
  131. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace('[','',regex=True)
  132. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace(']','',regex=True)
  133. # NTM_df['ASSETS_VAL']
  134. NTM_df['INTENT_VAL'] = intents_df
  135. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].astype(str)
  136. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace('[','',regex=True)
  137. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace(']','',regex=True)
  138. # NTM_df['INTENT_VAL']
  139. NTM_df['SOURCE_VAL'] = sources_df
  140. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].astype(str)
  141. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace('[','',regex=True)
  142. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace(']','',regex=True)
  143. # NTM_df['SOURCE_VAL']
  144. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  145. ##################### 여기서부터 진행하시면 됩니다. #####################
  146. ##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################
  147. # It should be 13 columns in total
  148. # 1. 기관 INST_NM
  149. # 2. 공격 DRULE_ATT_TYPE_CODE1
  150. # 3. 자산 ASSETS_VAL
  151. # 4. 위협공격ip TW_ATT_IP
  152. # 5. 위협공격port TW_ATT_PORT
  153. # 6. 위협피해ip TW_DMG_IP
  154. # 7. 위협피해port TW_DMG_PORT
  155. # 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM
  156. # 9. 공격국가 TW_ATT_CT_NM
  157. # 10. 의도(7개) INTENT_VAL
  158. # 11. IP/URL 가중치 SOURCE_VAL
  159. # 12. 장비 ACCD_FIND_MTD_CODE
  160. # 13. 탐지규칙명 DRULE_NM
  161. NTM_df.isna().sum()
  162. # Change the Nan to zero
  163. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  164. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  165. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  166. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  167. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  168. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  169. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  170. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  171. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  172. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  173. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  174. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  175. # Check NaN out again
  176. NTM_df.isna().sum()
  177. # # Merge all
  178. # # Make one string from all of elements
  179. NTM_df['Combined']=NTM_df['INST_NM'].astype(str)+' '+NTM_df['TW_ATT_IP'].astype(str)+' '+NTM_df['TW_ATT_PORT'].astype(str)+' '+NTM_df['TW_DMG_IP'].astype(str)+' '+NTM_df['TW_DMG_PORT'].astype(str) +' '+NTM_df['ACCD_DMG_PROTO_NM'].astype(str)+' '+NTM_df['TW_ATT_CT_NM']+' '+NTM_df['ASSETS_VAL']+' '+NTM_df['INTENT_VAL']+' '+NTM_df['SOURCE_VAL']+' '+NTM_df['DRULE_ATT_TYPE_CODE1']+' '+NTM_df['DRULE_NM']
  180. NTM_com=NTM_df['Combined']
  181. ## 내가 만든 컴바인
  182. ## 모든 조합을 돌릴거면, [1,2,3,4,5] 처럼 배열의 원소로 만들어서 넣는게 가장 베스트 아닌가.
  183. ## 그런데 순서를 다 바꿔줘야 하는데, 그건 어떻게 할 것인지 내일 물어보자.
  184. data_len = len(NTM_df)
  185. hwan_list = []
  186. for i in range(0,data_len):
  187. accd_dmg_proto_nm = NTM_df.loc[i]['ACCD_DMG_PROTO_NM']
  188. inst_nm = NTM_df.loc[i]['INST_NM']
  189. drule_att_type_code1 = NTM_df.loc[i]['DRULE_ATT_TYPE_CODE1']
  190. tw_att_ip = NTM_df.loc[i]['TW_ATT_IP']
  191. tw_att_port = NTM_df.loc[i]['TW_ATT_PORT']
  192. tw_dmg_ip = NTM_df.loc[i]['TW_DMG_IP']
  193. tw_dmg_port = NTM_df.loc[i]['TW_DMG_PORT']
  194. tw_att_ct_nm = NTM_df.loc[i]['TW_ATT_CT_NM']
  195. assets_val = NTM_df.loc[i]['ASSETS_VAL']
  196. intent_val = NTM_df.loc[i]['INTENT_VAL']
  197. source_val = NTM_df.loc[i]['SOURCE_VAL']
  198. drule_nm = NTM_df.loc[i]['DRULE_NM']
  199. null_check_list = [accd_dmg_proto_nm, inst_nm, drule_att_type_code1, tw_att_ip, tw_att_port,
  200. tw_dmg_ip, tw_dmg_port, tw_att_ct_nm, assets_val, intent_val, source_val, drule_nm]
  201. not_null_arr = []
  202. ## 리스트안에 빈 값을 빼버리자.
  203. for item in null_check_list:
  204. if item and item != '[]':
  205. not_null_arr.append(item)
  206. hwan_list.append(not_null_arr)
  207. new_ps = PrefixSpan(hwan_list)
  208. # new_ps : hwan_list안에 순서대로 null값을 제외한 모든값들이 [1,2,3,4,5,6] 이런식으로 들어가 있는데,
  209. # 이 값을 PrefixSpan 수행한 코드.
  210. ## 여기도 내 코드
  211. test_ntm = new_ps.frequent(1)
  212. test_ntm_df = pd.DataFrame(test_ntm)
  213. test_ntm_df.rename(columns={0:'Frequency', 1:'Cause'}, inplace=True)
  214. print(test_ntm_df)
  215. test_sort_values = test_ntm_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)
  216. # test_sort_values : PrefixSpan을 수행하여 Frequency가 나온 값. Frequency 를 기준으로 정렬했는데, 2900만가지나 된다.
  217. # Effect Rule Name Case 1.
  218. prefix_NTM_df = test_sort_values
  219. for i in range(0,len(prefix_NTM_df)):
  220. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  221. for drule in drules:
  222. if drule in prefix_NTM_df.Cause[i]:
  223. prefix_NTM_df.Effect[i] = drule
  224. elif prefix_NTM_df.Cause[i] == NaN:
  225. prefix_NTM_df.Effect[i] = ''
  226. # Effect Rule Name Case 2.
  227. prefix_NTM_df = test_sort_values
  228. for i in range(0,len(prefix_NTM_df)):
  229. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  230. for drule in drules:
  231. iloc_value = prefix_NTM_df.iloc[i]['Cause']
  232. if drule in iloc_value:
  233. prefix_NTM_df.Cause[i] = drule
  234. # Case 1을 20분동안 돌렸는데도 결과가 나오질 않아서 iloc으로 바꾸어 Case2로 돌려보았다. 이것도 오래걸리는 중..
  235. # Rule Name 적용한 코드는 prefix_NTM_df 에 적용.