파이썬 기반의 PrefixSpan 분석 (Python-based PrefixSpan sequential-pattern analysis)
(GitHub topic-selector UI text captured during copy/paste — not part of this analysis.)

prefixspanTest.py 11KB

(GitHub line-number gutter captured during copy/paste — no document content.)
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. #!/usr/bin/env python
  5. # coding: utf-8
  6. # In[1]:
  7. import pandas as pd
  8. import numpy as np
  9. from mlxtend.preprocessing import TransactionEncoder
  10. from mlxtend.frequent_patterns import association_rules, fpgrowth
  11. from prefixspan import PrefixSpan
  12. df = pd.read_csv("ts_data_accident-2020_sample.csv", low_memory=False, encoding='ISO-8859-1')
  13. pd.set_option('display.max_columns',None)
  14. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
  15. len(df)
  16. ##################### NTM section #####################
  17. NTM_df=df[df['ACCD_FIND_MTD_CODE']==1]
  18. NTM_df
  19. # In[2]:
  20. # Pick out it in order to get the asset, risk, intent, black IP out
  21. RISK_V2=NTM_df['RISK_V2']
  22. RISK_V2_FILTERED=RISK_V2.dropna()
  23. ## 결측값 제거.
  24. import json
  25. from pandas import json_normalize
  26. # modified
  27. def get_asset_desc(asset_field):
  28. if asset_field == 'ASSETS_VAL_1':
  29. return '공인-전체IP대역(유선)'
  30. elif asset_field == 'ASSETS_VAL_2':
  31. return '공인-전체IP대역(무선)'
  32. elif asset_field == 'ASSETS_VAL_3':
  33. return '공인-WEB서버'
  34. elif asset_field == 'ASSETS_VAL_4':
  35. return '공인-내부응용서버'
  36. elif asset_field == 'ASSETS_VAL_5':
  37. return '공인-DB서버'
  38. elif asset_field == 'ASSETS_VAL_6':
  39. return '공인-패치서버'
  40. elif asset_field == 'ASSETS_VAL_7':
  41. return '공인-네트워크'
  42. elif asset_field == 'ASSETS_VAL_8':
  43. return '공인-보안'
  44. elif asset_field == 'ASSETS_VAL_9':
  45. return '공인-업무용PC'
  46. elif asset_field == 'ASSETS_VAL_10':
  47. return '공인-비업무용PC'
  48. elif asset_field == 'ASSETS_VAL_11':
  49. return '공인-기타'
  50. elif asset_field == 'ASSETS_VAL_12':
  51. return '사설-전체IP대역(유선)'
  52. elif asset_field == 'ASSETS_VAL_13':
  53. return '사설-전체IP대역(무선)'
  54. elif asset_field == 'ASSETS_VAL_14':
  55. return '사설-WEB서버'
  56. elif asset_field == 'ASSETS_VAL_15':
  57. return '사설-내부응용서버'
  58. elif asset_field == 'ASSETS_VAL_16':
  59. return '사설-DB서버'
  60. elif asset_field == 'ASSETS_VAL_17':
  61. return '사설-패치서버'
  62. elif asset_field == 'ASSETS_VAL_18':
  63. return '사설-네트워크'
  64. elif asset_field == 'ASSETS_VAL_19':
  65. return '사설-보안'
  66. elif asset_field == 'ASSETS_VAL_20':
  67. return '사설-업무용PC'
  68. elif asset_field == 'ASSETS_VAL_21':
  69. return '사설-비업무용PC'
  70. elif asset_field == 'ASSETS_VAL_22':
  71. return '사설-기타'
  72. else:
  73. return ''
  74. def get_intent_desc(intent_field):
  75. if intent_field == 'INTENT_VAL_1':
  76. return '파괴'
  77. elif intent_field == 'INTENT_VAL_2':
  78. return '유출'
  79. elif intent_field == 'INTENT_VAL_3':
  80. return '지연'
  81. elif intent_field == 'INTENT_VAL_4':
  82. return '잠복'
  83. elif intent_field == 'INTENT_VAL_5':
  84. return '단순침입'
  85. elif intent_field == 'INTENT_VAL_6':
  86. return 'MD5'
  87. elif intent_field == 'INTENT_VAL_0':
  88. return 'Default'
  89. else:
  90. return ''
  91. def get_source_desc(source_field):
  92. if source_field=='SOURCE_VAL_1':
  93. return '북한IP'
  94. if source_field=='SOURCE_VAL_3':
  95. return 'ECSC Black IP'
  96. else:
  97. return ''
  98. # New assets column
  99. ## ASSETS_VAL을 아예 JSON항목으로 만들어서 새로운 데이터프레임으로 생성.
  100. risk_df = pd.DataFrame()
  101. for risk in RISK_V2_FILTERED:
  102. risk = risk.replace("'", "\"") #json으로 만들려고.
  103. json_string = json.loads(risk)
  104. json_df = json_normalize(json_string)
  105. risk_df = pd.concat([risk_df,json_df],ignore_index=True) #DataFrame 합쳐주기. ignore_index = True를 해야 index가 재구성 된다.
  106. risk_df_column_names = risk_df.columns
  107. assets_df = []
  108. intents_df = []
  109. sources_df = []
  110. def filter_all(risk):
  111. for i in range(0,len(risk)):
  112. risks=[]
  113. intents=[]
  114. sources=[]
  115. for column in risk_df_column_names:
  116. # filter_asset
  117. if 'ASSETS_VAL_' in column and risk.iloc[i][column]:
  118. risk_key_desc = 'RISK_V2.' + column + '=' + get_asset_desc(column)
  119. risks.append(risk_key_desc)
  120. # filter_intent
  121. if 'INTENT_VAL_' in column and risk.iloc[i][column]:
  122. intent_key_desc = 'RISK_V2.' + column + '=' + get_intent_desc(column)
  123. intents.append(intent_key_desc)
  124. if 'SOURCE_VAL_' in column and risk.iloc[i][column]:
  125. source_key_desc='RISK_V2.' + column + '=' + get_source_desc(column)
  126. sources.append(source_key_desc)
  127. assets_df.append(risks)
  128. intents_df.append(intents)
  129. sources_df.append(sources)
  130. filter_all(risk_df)
  131. ## 여기까지 내가 만든 것.
  132. # In[3]:
  133. NTM_df['ASSETS_VAL'] = assets_df
  134. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].astype(str)
  135. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace('[','',regex=True)
  136. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace(']','',regex=True)
  137. NTM_df['ASSETS_VAL']
  138. # In[4]:
  139. NTM_df['INTENT_VAL'] = intents_df
  140. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].astype(str)
  141. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace('[','',regex=True)
  142. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace(']','',regex=True)
  143. NTM_df['INTENT_VAL']
  144. # In[5]:
  145. NTM_df['SOURCE_VAL'] = sources_df
  146. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].astype(str)
  147. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace('[','',regex=True)
  148. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace(']','',regex=True)
  149. NTM_df['SOURCE_VAL']
  150. # In[8]:
  151. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  152. # In[12]:
  153. ##################### 여기서부터 진행하시면 됩니다. #####################
  154. ##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################
  155. # It should be 13 columns in total
  156. # 1. 기관 INST_NM
  157. # 2. 공격 DRULE_ATT_TYPE_CODE1
  158. # 3. 자산 ASSETS_VAL
  159. # 4. 위협공격ip TW_ATT_IP
  160. # 5. 위협공격port TW_ATT_PORT
  161. # 6. 위협피해ip TW_DMG_IP
  162. # 7. 위협피해port TW_DMG_PORT
  163. # 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM
  164. # 9. 공격국가 TW_ATT_CT_NM
  165. # 10. 의도(7개) INTENT_VAL
  166. # 11. IP/URL 가중치 SOURCE_VAL
  167. # 12. 장비 ACCD_FIND_MTD_CODE
  168. # 13. 탐지규칙명 DRULE_NM
  169. NTM_df.isna().sum()
  170. # Change the Nan to zero
  171. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  172. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  173. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  174. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  175. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  176. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  177. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  178. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  179. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  180. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  181. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  182. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  183. # Check NaN out again
  184. NTM_df.isna().sum()
  185. copy_df = NTM_df.copy()
  186. copy_df.drop(columns=['ACCD_FIND_MTD_CODE'],inplace=True)
  187. copy_df.columns=['item1','item2','item3','item4','item5','item6','item7','item8','item9','item10','item11','item12']
  188. data_len = len(NTM_df)
  189. hwan_list = []
  190. # Combination
  191. import itertools
  192. arr = ['item1','item2','item3','item4','item5','item6','item7','item8','item9','item10','item11']
  193. nCr = list(itertools.combinations(arr,6))
  194. # item들은 이 순서다.
  195. #item1 = 'INST_NM'
  196. #item2 = 'DRULE_ATT_TYPE_CODE1'
  197. #item3 = 'TW_ATT_IP'
  198. #item4 = 'TW_ATT_PORT'
  199. #item5 = 'TW_DMG_IP'
  200. #item6 = 'TW_DMG_PORT'
  201. #item7 = 'ACCD_DMG_PROTO_NM'
  202. #item8 = 'TW_ATT_CT_NM'
  203. #item9 = 'DRULE_NM'
  204. #item10 = 'ASSETS_VAL'
  205. #item11 = 'INTENT_VAL'
  206. #item12 = 'SOURCE_VAL'
  207. for i in range(0,data_len):
  208. # item들은 이 순서다.
  209. item1 = NTM_df.loc[i]['INST_NM']
  210. item2 = NTM_df.loc[i]['DRULE_ATT_TYPE_CODE1']
  211. item3 = NTM_df.loc[i]['TW_ATT_IP']
  212. item4 = NTM_df.loc[i]['TW_ATT_PORT']
  213. item5 = NTM_df.loc[i]['TW_DMG_IP']
  214. item6 = NTM_df.loc[i]['TW_DMG_PORT']
  215. item7 = NTM_df.loc[i]['ACCD_DMG_PROTO_NM']
  216. item8 = NTM_df.loc[i]['TW_ATT_CT_NM']
  217. item9 = NTM_df.loc[i]['DRULE_NM']
  218. item10 = NTM_df.loc[i]['ASSETS_VAL']
  219. item11 = NTM_df.loc[i]['INTENT_VAL']
  220. item12 = NTM_df.loc[i]['SOURCE_VAL']
  221. not_null_arr = []
  222. ## 리스트안에 빈 값을 빼버리자.
  223. null_check_list = [item1,item2,item3,item4,item5,item6,item7,item8,item9,item10,item11,item12]
  224. for item in null_check_list:
  225. if item and item != '[]':
  226. not_null_arr.append(item)
  227. hwan_list.append(not_null_arr)
  228. new_ps = PrefixSpan(hwan_list)
  229. copy_df
  230. # In[23]:
  231. comlist = []
  232. for n in range(0,3):
  233. for i in range(0,data_len):
  234. itemlist = []
  235. locdata = copy_df.iloc[i]
  236. for item in nCr[n]:
  237. itemlist.append(locdata[item])
  238. comlist.append(itemlist)
  239. comlist #아이템들의 조합. nCr을 한 아이템들의 조합들. 이걸로 순서를 찾아보자.
  240. # In[25]:
  241. ## 여기도 내 코드
  242. test_ntm = new_ps.frequent(1,filter = lambda patt, matches:len(patt)>5)
  243. test_ntm_df = pd.DataFrame(test_ntm)
  244. test_ntm_df.rename(columns={0:'Frequency', 1:'Cause'}, inplace=True)
  245. # Make the new column for filling the Effect
  246. test_ntm_df['Effect']=np.nan
  247. # Change the order of columns
  248. test_ntm_df=test_ntm_df[['Cause','Effect','Frequency']]
  249. test_sort_values = test_ntm_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)
  250. ##
  251. # In[26]:
  252. prefix_NTM_df = test_sort_values.copy()
  253. prefix_NTM_df
  254. # In[ ]:
  255. # Define the function that find the rule name
  256. # 데이터 크기를 줄여서 실행해본 결과 정상 작동함.
  257. for i in range(0,len(prefix_NTM_df)):
  258. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  259. loc_value = prefix_NTM_df.loc[i]
  260. for item in prefix_NTM_df.loc[i,'Cause']:
  261. for drule in drules:
  262. if item == drule:
  263. prefix_NTM_df.loc[i,'Effect'] = drule
  264. break
  265. # In[27]:
  266. prefix_NTM_df['Cause'] = [','.join(map(str, word))for word in prefix_NTM_df['Cause']]
  267. # Cause Column을 하나의 string으로 변환.
  268. # In[ ]:
  269. # 정규표현식 사용해서 매칭하기.
  270. # 정규표현식 사용하는 틀. words에 배열만 넣으면 된다.
  271. def regbase(words):
  272. base = r'^{}'
  273. expr = '(?=.*{})'
  274. ret = base.format(''.join(expr.format(w) for w in words))
  275. return ret
  276. for i in range(0,20):
  277. print(comlist[i])
  278. print(prefix_NTM_df[prefix_NTM_df['Cause'].str.contains(regbase(comlist[i]),na=False,regex=True)])
  279. # In[ ]: