# Python-based PrefixSpan analysis (파이썬 기반의 Prefix span 분석)
# Source file: prefixspanTest.py (11KB), exported from a Git web viewer.
# NOTE: the repository page chrome and the viewer's line-number gutter were
# captured with the export and have been removed from this header.
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. #!/usr/bin/env python
  5. # coding: utf-8
  6. # In[1]:
  7. import pandas as pd
  8. import numpy as np
  9. from mlxtend.preprocessing import TransactionEncoder
  10. from mlxtend.frequent_patterns import association_rules, fpgrowth
  11. from prefixspan import PrefixSpan
  12. df = pd.read_csv("ts_data_accident-2020_sample.csv", low_memory=False, encoding='ISO-8859-1')
  13. pd.set_option('display.max_columns',None)
  14. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
  15. len(df)
  16. ##################### NTM section #####################
  17. NTM_df=df[df['ACCD_FIND_MTD_CODE']==1]
  18. NTM_df
  19. # In[2]:
  20. # Pick out it in order to get the asset, risk, intent, black IP out
  21. RISK_V2=NTM_df['RISK_V2']
  22. RISK_V2_FILTERED=RISK_V2.dropna()
  23. ## 결측값 제거.
  24. import json
  25. from pandas import json_normalize
  26. # modified
  27. def get_asset_desc(asset_field):
  28. if asset_field == 'ASSETS_VAL_1':
  29. return '공인-전체IP대역(유선)'
  30. elif asset_field == 'ASSETS_VAL_2':
  31. return '공인-전체IP대역(무선)'
  32. elif asset_field == 'ASSETS_VAL_3':
  33. return '공인-WEB서버'
  34. elif asset_field == 'ASSETS_VAL_4':
  35. return '공인-내부응용서버'
  36. elif asset_field == 'ASSETS_VAL_5':
  37. return '공인-DB서버'
  38. elif asset_field == 'ASSETS_VAL_6':
  39. return '공인-패치서버'
  40. elif asset_field == 'ASSETS_VAL_7':
  41. return '공인-네트워크'
  42. elif asset_field == 'ASSETS_VAL_8':
  43. return '공인-보안'
  44. elif asset_field == 'ASSETS_VAL_9':
  45. return '공인-업무용PC'
  46. elif asset_field == 'ASSETS_VAL_10':
  47. return '공인-비업무용PC'
  48. elif asset_field == 'ASSETS_VAL_11':
  49. return '공인-기타'
  50. elif asset_field == 'ASSETS_VAL_12':
  51. return '사설-전체IP대역(유선)'
  52. elif asset_field == 'ASSETS_VAL_13':
  53. return '사설-전체IP대역(무선)'
  54. elif asset_field == 'ASSETS_VAL_14':
  55. return '사설-WEB서버'
  56. elif asset_field == 'ASSETS_VAL_15':
  57. return '사설-내부응용서버'
  58. elif asset_field == 'ASSETS_VAL_16':
  59. return '사설-DB서버'
  60. elif asset_field == 'ASSETS_VAL_17':
  61. return '사설-패치서버'
  62. elif asset_field == 'ASSETS_VAL_18':
  63. return '사설-네트워크'
  64. elif asset_field == 'ASSETS_VAL_19':
  65. return '사설-보안'
  66. elif asset_field == 'ASSETS_VAL_20':
  67. return '사설-업무용PC'
  68. elif asset_field == 'ASSETS_VAL_21':
  69. return '사설-비업무용PC'
  70. elif asset_field == 'ASSETS_VAL_22':
  71. return '사설-기타'
  72. else:
  73. return ''
  74. def get_intent_desc(intent_field):
  75. if intent_field == 'INTENT_VAL_1':
  76. return '파괴'
  77. elif intent_field == 'INTENT_VAL_2':
  78. return '유출'
  79. elif intent_field == 'INTENT_VAL_3':
  80. return '지연'
  81. elif intent_field == 'INTENT_VAL_4':
  82. return '잠복'
  83. elif intent_field == 'INTENT_VAL_5':
  84. return '단순침입'
  85. elif intent_field == 'INTENT_VAL_6':
  86. return 'MD5'
  87. elif intent_field == 'INTENT_VAL_0':
  88. return 'Default'
  89. else:
  90. return ''
  91. def get_source_desc(source_field):
  92. if source_field=='SOURCE_VAL_1':
  93. return '북한IP'
  94. if source_field=='SOURCE_VAL_3':
  95. return 'ECSC Black IP'
  96. else:
  97. return ''
  98. # New assets column
  99. ## ASSETS_VAL을 아예 JSON항목으로 만들어서 새로운 데이터프레임으로 생성.
  100. risk_df = pd.DataFrame()
  101. for risk in RISK_V2_FILTERED:
  102. risk = risk.replace("'", "\"") #json으로 만들려고.
  103. json_string = json.loads(risk)
  104. json_df = json_normalize(json_string)
  105. risk_df = pd.concat([risk_df,json_df],ignore_index=True) #DataFrame 합쳐주기. ignore_index = True를 해야 index가 재구성 된다.
  106. risk_df_column_names = risk_df.columns
  107. assets_df = []
  108. intents_df = []
  109. sources_df = []
  110. def filter_all(risk):
  111. for i in range(0,len(risk)):
  112. risks=[]
  113. intents=[]
  114. sources=[]
  115. for column in risk_df_column_names:
  116. # filter_asset
  117. if 'ASSETS_VAL_' in column and risk.iloc[i][column]:
  118. risk_key_desc = 'RISK_V2.' + column + '=' + get_asset_desc(column)
  119. risks.append(risk_key_desc)
  120. # filter_intent
  121. if 'INTENT_VAL_' in column and risk.iloc[i][column]:
  122. intent_key_desc = 'RISK_V2.' + column + '=' + get_intent_desc(column)
  123. intents.append(intent_key_desc)
  124. if 'SOURCE_VAL_' in column and risk.iloc[i][column]:
  125. source_key_desc='RISK_V2.' + column + '=' + get_source_desc(column)
  126. sources.append(source_key_desc)
  127. assets_df.append(risks)
  128. intents_df.append(intents)
  129. sources_df.append(sources)
  130. filter_all(risk_df)
  131. ## 여기까지 내가 만든 것.
  132. # In[3]:
  133. NTM_df['ASSETS_VAL'] = assets_df
  134. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].astype(str)
  135. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace('[','',regex=True)
  136. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace(']','',regex=True)
  137. NTM_df['ASSETS_VAL']
  138. # In[4]:
  139. NTM_df['INTENT_VAL'] = intents_df
  140. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].astype(str)
  141. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace('[','',regex=True)
  142. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace(']','',regex=True)
  143. NTM_df['INTENT_VAL']
  144. # In[5]:
  145. NTM_df['SOURCE_VAL'] = sources_df
  146. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].astype(str)
  147. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace('[','',regex=True)
  148. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace(']','',regex=True)
  149. NTM_df['SOURCE_VAL']
  150. # In[8]:
  151. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  152. # In[12]:
  153. ##################### 여기서부터 진행하시면 됩니다. #####################
  154. ##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################
  155. # It should be 13 columns in total
  156. # 1. 기관 INST_NM
  157. # 2. 공격 DRULE_ATT_TYPE_CODE1
  158. # 3. 자산 ASSETS_VAL
  159. # 4. 위협공격ip TW_ATT_IP
  160. # 5. 위협공격port TW_ATT_PORT
  161. # 6. 위협피해ip TW_DMG_IP
  162. # 7. 위협피해port TW_DMG_PORT
  163. # 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM
  164. # 9. 공격국가 TW_ATT_CT_NM
  165. # 10. 의도(7개) INTENT_VAL
  166. # 11. IP/URL 가중치 SOURCE_VAL
  167. # 12. 장비 ACCD_FIND_MTD_CODE
  168. # 13. 탐지규칙명 DRULE_NM
  169. NTM_df.isna().sum()
  170. # Change the Nan to zero
  171. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  172. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  173. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  174. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  175. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  176. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  177. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  178. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  179. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  180. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  181. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  182. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  183. # Check NaN out again
  184. NTM_df.isna().sum()
  185. copy_df = NTM_df.copy()
  186. copy_df.drop(columns=['ACCD_FIND_MTD_CODE'],inplace=True)
  187. copy_df.columns=['item1','item2','item3','item4','item5','item6','item7','item8','item9','item10','item11','item12']
  188. data_len = len(NTM_df)
  189. hwan_list = []
  190. # Combination
  191. import itertools
  192. arr = ['item1','item2','item3','item4','item5','item6','item7','item8','item9','item10','item11']
  193. nCr = list(itertools.combinations(arr,6))
  194. # item들은 이 순서다.
  195. #item1 = 'INST_NM'
  196. #item2 = 'DRULE_ATT_TYPE_CODE1'
  197. #item3 = 'TW_ATT_IP'
  198. #item4 = 'TW_ATT_PORT'
  199. #item5 = 'TW_DMG_IP'
  200. #item6 = 'TW_DMG_PORT'
  201. #item7 = 'ACCD_DMG_PROTO_NM'
  202. #item8 = 'TW_ATT_CT_NM'
  203. #item9 = 'DRULE_NM'
  204. #item10 = 'ASSETS_VAL'
  205. #item11 = 'INTENT_VAL'
  206. #item12 = 'SOURCE_VAL'
  207. for i in range(0,data_len):
  208. # item들은 이 순서다.
  209. item1 = NTM_df.loc[i]['INST_NM']
  210. item2 = NTM_df.loc[i]['DRULE_ATT_TYPE_CODE1']
  211. item3 = NTM_df.loc[i]['TW_ATT_IP']
  212. item4 = NTM_df.loc[i]['TW_ATT_PORT']
  213. item5 = NTM_df.loc[i]['TW_DMG_IP']
  214. item6 = NTM_df.loc[i]['TW_DMG_PORT']
  215. item7 = NTM_df.loc[i]['ACCD_DMG_PROTO_NM']
  216. item8 = NTM_df.loc[i]['TW_ATT_CT_NM']
  217. item9 = NTM_df.loc[i]['DRULE_NM']
  218. item10 = NTM_df.loc[i]['ASSETS_VAL']
  219. item11 = NTM_df.loc[i]['INTENT_VAL']
  220. item12 = NTM_df.loc[i]['SOURCE_VAL']
  221. not_null_arr = []
  222. ## 리스트안에 빈 값을 빼버리자.
  223. null_check_list = [item1,item2,item3,item4,item5,item6,item7,item8,item9,item10,item11,item12]
  224. for item in null_check_list:
  225. if item and item != '[]':
  226. not_null_arr.append(item)
  227. hwan_list.append(not_null_arr)
  228. new_ps = PrefixSpan(hwan_list)
  229. copy_df
  230. # In[23]:
  231. comlist = []
  232. for n in range(0,3):
  233. for i in range(0,data_len):
  234. itemlist = []
  235. locdata = copy_df.iloc[i]
  236. for item in nCr[n]:
  237. itemlist.append(locdata[item])
  238. comlist.append(itemlist)
  239. comlist #아이템들의 조합. nCr을 한 아이템들의 조합들. 이걸로 순서를 찾아보자.
  240. # In[25]:
  241. ## 여기도 내 코드
  242. test_ntm = new_ps.frequent(1,filter = lambda patt, matches:len(patt)>5)
  243. test_ntm_df = pd.DataFrame(test_ntm)
  244. test_ntm_df.rename(columns={0:'Frequency', 1:'Cause'}, inplace=True)
  245. # Make the new column for filling the Effect
  246. test_ntm_df['Effect']=np.nan
  247. # Change the order of columns
  248. test_ntm_df=test_ntm_df[['Cause','Effect','Frequency']]
  249. test_sort_values = test_ntm_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)
  250. ##
  251. # In[26]:
  252. prefix_NTM_df = test_sort_values.copy()
  253. prefix_NTM_df
  254. # In[ ]:
  255. # Define the function that find the rule name
  256. # 데이터 크기를 줄여서 실행해본 결과 정상 작동함.
  257. for i in range(0,len(prefix_NTM_df)):
  258. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  259. loc_value = prefix_NTM_df.loc[i]
  260. for item in prefix_NTM_df.loc[i,'Cause']:
  261. for drule in drules:
  262. if item == drule:
  263. prefix_NTM_df.loc[i,'Effect'] = drule
  264. break
  265. # In[27]:
  266. prefix_NTM_df['Cause'] = [','.join(map(str, word))for word in prefix_NTM_df['Cause']]
  267. # Cause Column을 하나의 string으로 변환.
  268. # In[ ]:
  269. # 정규표현식 사용해서 매칭하기.
  270. # 정규표현식 사용하는 틀. words에 배열만 넣으면 된다.
  271. def regbase(words):
  272. base = r'^{}'
  273. expr = '(?=.*{})'
  274. ret = base.format(''.join(expr.format(w) for w in words))
  275. return ret
  276. for i in range(0,20):
  277. print(comlist[i])
  278. print(prefix_NTM_df[prefix_NTM_df['Cause'].str.contains(regbase(comlist[i]),na=False,regex=True)])
  279. # In[ ]: