파이썬 기반의 Prefix span 분석
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

prefix_1025.py 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules, fpgrowth
from prefixspan import PrefixSpan
# Load the 2020 accident sample.  low_memory=False reads the file in one
# pass (no mixed-dtype chunk warnings); ISO-8859-1 matches the on-disk encoding.
df = pd.read_csv("ts_data_accident-2020_sample.csv", low_memory=False, encoding='ISO-8859-1')
pd.set_option('display.max_columns',None)
# Keep only the 11 analysis columns and drop rows with any missing value.
df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()
len(df)  # notebook-style bare expression: shows the row count interactively
##################### NTM section #####################
# Keep rows whose detection-method code is 1 (presumably the NTM device -- TODO confirm code meaning).
NTM_df=df[df['ACCD_FIND_MTD_CODE']==1]
NTM_df
# Pick out it in order to get the asset, risk, intent, black IP out
RISK_V2=NTM_df['RISK_V2']
RISK_V2_FILTERED=RISK_V2.dropna()
## Drop missing values.
import json
from pandas import json_normalize
# modified
  23. def get_asset_desc(asset_field):
  24. if asset_field == 'ASSETS_VAL_1':
  25. return '공인-전체IP대역(유선)'
  26. elif asset_field == 'ASSETS_VAL_2':
  27. return '공인-전체IP대역(무선)'
  28. elif asset_field == 'ASSETS_VAL_3':
  29. return '공인-WEB서버'
  30. elif asset_field == 'ASSETS_VAL_4':
  31. return '공인-내부응용서버'
  32. elif asset_field == 'ASSETS_VAL_5':
  33. return '공인-DB서버'
  34. elif asset_field == 'ASSETS_VAL_6':
  35. return '공인-패치서버'
  36. elif asset_field == 'ASSETS_VAL_7':
  37. return '공인-네트워크'
  38. elif asset_field == 'ASSETS_VAL_8':
  39. return '공인-보안'
  40. elif asset_field == 'ASSETS_VAL_9':
  41. return '공인-업무용PC'
  42. elif asset_field == 'ASSETS_VAL_10':
  43. return '공인-비업무용PC'
  44. elif asset_field == 'ASSETS_VAL_11':
  45. return '공인-기타'
  46. elif asset_field == 'ASSETS_VAL_12':
  47. return '사설-전체IP대역(유선)'
  48. elif asset_field == 'ASSETS_VAL_13':
  49. return '사설-전체IP대역(무선)'
  50. elif asset_field == 'ASSETS_VAL_14':
  51. return '사설-WEB서버'
  52. elif asset_field == 'ASSETS_VAL_15':
  53. return '사설-내부응용서버'
  54. elif asset_field == 'ASSETS_VAL_16':
  55. return '사설-DB서버'
  56. elif asset_field == 'ASSETS_VAL_17':
  57. return '사설-패치서버'
  58. elif asset_field == 'ASSETS_VAL_18':
  59. return '사설-네트워크'
  60. elif asset_field == 'ASSETS_VAL_19':
  61. return '사설-보안'
  62. elif asset_field == 'ASSETS_VAL_20':
  63. return '사설-업무용PC'
  64. elif asset_field == 'ASSETS_VAL_21':
  65. return '사설-비업무용PC'
  66. elif asset_field == 'ASSETS_VAL_22':
  67. return '사설-기타'
  68. else:
  69. return ''
  70. def get_intent_desc(intent_field):
  71. if intent_field == 'INTENT_VAL_1':
  72. return '파괴'
  73. elif intent_field == 'INTENT_VAL_2':
  74. return '유출'
  75. elif intent_field == 'INTENT_VAL_3':
  76. return '지연'
  77. elif intent_field == 'INTENT_VAL_4':
  78. return '잠복'
  79. elif intent_field == 'INTENT_VAL_5':
  80. return '단순침입'
  81. elif intent_field == 'INTENT_VAL_6':
  82. return 'MD5'
  83. elif intent_field == 'INTENT_VAL_0':
  84. return 'Default'
  85. else:
  86. return ''
  87. def get_source_desc(source_field):
  88. if source_field=='SOURCE_VAL_1':
  89. return '북한IP'
  90. if source_field=='SOURCE_VAL_3':
  91. return 'ECSC Black IP'
  92. else:
  93. return ''
  94. # New assets column
  95. ## ASSETS_VAL을 아예 JSON항목으로 만들어서 새로운 데이터프레임으로 생성.
  96. risk_df = pd.DataFrame()
  97. for risk in RISK_V2_FILTERED:
  98. risk = risk.replace("'", "\"") #json으로 만들려고.
  99. json_string = json.loads(risk)
  100. json_df = json_normalize(json_string)
  101. risk_df = pd.concat([risk_df,json_df],ignore_index=True) #DataFrame 합쳐주기. ignore_index = True를 해야 index가 재구성 된다.
  102. risk_df_column_names = risk_df.columns
  103. assets_df = []
  104. intents_df = []
  105. sources_df = []
def filter_all(risk):
    # For every row of `risk`, collect the truthy RISK_V2 flag columns into
    # three parallel per-row description lists (assets / intents / sources)
    # and append them to the module-level accumulators
    # assets_df / intents_df / sources_df.
    # NOTE(review): calling this twice duplicates entries, since it appends
    # to module-level lists rather than returning fresh ones.
    for i in range(0,len(risk)):
        risks=[]
        intents=[]
        sources=[]
        for column in risk_df_column_names:
            # filter_asset
            if 'ASSETS_VAL_' in column and risk.iloc[i][column]:
                risk_key_desc = 'RISK_V2.' + column + '=' + get_asset_desc(column)
                risks.append(risk_key_desc)
            # filter_intent
            if 'INTENT_VAL_' in column and risk.iloc[i][column]:
                intent_key_desc = 'RISK_V2.' + column + '=' + get_intent_desc(column)
                intents.append(intent_key_desc)
            # filter_source
            if 'SOURCE_VAL_' in column and risk.iloc[i][column]:
                source_key_desc='RISK_V2.' + column + '=' + get_source_desc(column)
                sources.append(source_key_desc)
        assets_df.append(risks)
        intents_df.append(intents)
        sources_df.append(sources)
filter_all(risk_df)
  127. ## 여기까지 내가 만든 것.
  128. NTM_df['ASSETS_VAL'] = assets_df
  129. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].astype(str)
  130. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace('[','',regex=True)
  131. NTM_df['ASSETS_VAL'] = NTM_df['ASSETS_VAL'].str.replace(']','',regex=True)
  132. NTM_df['ASSETS_VAL']
  133. NTM_df['INTENT_VAL'] = intents_df
  134. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].astype(str)
  135. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace('[','',regex=True)
  136. NTM_df['INTENT_VAL'] = NTM_df['INTENT_VAL'].str.replace(']','',regex=True)
  137. NTM_df['INTENT_VAL']
  138. NTM_df['SOURCE_VAL'] = sources_df
  139. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].astype(str)
  140. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace('[','',regex=True)
  141. NTM_df['SOURCE_VAL'] = NTM_df['SOURCE_VAL'].str.replace(']','',regex=True)
  142. NTM_df['SOURCE_VAL']
  143. NTM_df.drop(columns=['RISK_V2'], inplace=True)
  144. ##################### 여기서부터 진행하시면 됩니다. #####################
  145. ##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################
  146. # It should be 13 columns in total
  147. # 1. 기관 INST_NM
  148. # 2. 공격 DRULE_ATT_TYPE_CODE1
  149. # 3. 자산 ASSETS_VAL
  150. # 4. 위협공격ip TW_ATT_IP
  151. # 5. 위협공격port TW_ATT_PORT
  152. # 6. 위협피해ip TW_DMG_IP
  153. # 7. 위협피해port TW_DMG_PORT
  154. # 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM
  155. # 9. 공격국가 TW_ATT_CT_NM
  156. # 10. 의도(7개) INTENT_VAL
  157. # 11. IP/URL 가중치 SOURCE_VAL
  158. # 12. 장비 ACCD_FIND_MTD_CODE
  159. # 13. 탐지규칙명 DRULE_NM
  160. NTM_df.isna().sum()
  161. # Change the Nan to zero
  162. NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')
  163. NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')
  164. NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')
  165. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)
  166. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)
  167. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)
  168. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)
  169. NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')
  170. NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)
  171. NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)
  172. NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)
  173. NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')
  174. NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP']
  175. NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT']
  176. NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP']
  177. NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT']
  178. # Check NaN out again
  179. NTM_df.isna().sum()
  180. copy_df = NTM_df.copy() #원본도 안건드리고, 실행시킬 때마다 오류 떠서 copy로 하는게 좋을 것 같다.
  181. copy_df.drop(columns=['ACCD_FIND_MTD_CODE'],inplace=True)
  182. data_len = len(NTM_df)
  183. # Combination
  184. import itertools
  185. # Combination 조합들 생성하는 함수. row마다 mCn 생성.
  186. def get_comb_df(df, n):
  187. nCr = list(itertools.combinations(df.columns.tolist(),n))
  188. nCr = [column for column in nCr if 'DRULE_ATT_TYPE_CODE1' in column]
  189. ret_list = []
  190. for l in range(len(nCr)):
  191. for i in range(len(df)):
  192. temp = []
  193. temp_df = df.loc[i]
  194. for col in nCr[l]:
  195. new_string = col
  196. new_string = new_string + ":" + str(temp_df[col])
  197. temp.append(new_string)
  198. ret_list.append(temp)
  199. return ret_list
  200. # item들은 이 순서다.
  201. #item1 = 'INST_NM'
  202. #item2 = 'DRULE_ATT_TYPE_CODE1'
  203. #item3 = 'TW_ATT_IP'
  204. #item4 = 'TW_ATT_PORT'
  205. #item5 = 'TW_DMG_IP'
  206. #item6 = 'TW_DMG_PORT'
  207. #item7 = 'ACCD_DMG_PROTO_NM'
  208. #item8 = 'TW_ATT_CT_NM'
  209. #item9 = 'DRULE_NM'
  210. #item10 = 'ASSETS_VAL'
  211. #item11 = 'INTENT_VAL'
  212. #item12 = 'SOURCE_VAL'
  213. nonnull_list = []
  214. for i in range(0,data_len):
  215. item1 = 'INST_NM:' + NTM_df.loc[i]['INST_NM']
  216. item2 = 'DRULE_ATT_TYPE_CODE1:' + NTM_df.loc[i]['DRULE_ATT_TYPE_CODE1']
  217. item3 = 'TW_ATT_IP:' + NTM_df.loc[i]['TW_ATT_IP'].astype(str)
  218. item4 = 'TW_ATT_PORT:' + NTM_df.loc[i]['TW_ATT_PORT'].astype(str)
  219. item5 = 'TW_DMG_IP:' + NTM_df.loc[i]['TW_DMG_IP'].astype(str)
  220. item6 = 'TW_DMG_PORT:' + NTM_df.loc[i]['TW_DMG_PORT'].astype(str)
  221. item7 = 'ACCD_DMG_PROTO_NM:' + NTM_df.loc[i]['ACCD_DMG_PROTO_NM']
  222. item8 = 'TW_ATT_CT_NM:' + NTM_df.loc[i]['TW_ATT_CT_NM']
  223. item9 = 'DRULE_NM:' + NTM_df.loc[i]['DRULE_NM']
  224. item10 = NTM_df.loc[i]['ASSETS_VAL']
  225. item11 = NTM_df.loc[i]['INTENT_VAL']
  226. item12 = NTM_df.loc[i]['SOURCE_VAL']
  227. not_null_arr = []
  228. ## 리스트안에 빈 값을 빼버리자.
  229. null_check_list = [item1,item2,item3,item4,item5,item6,item7,item8,item9,item10,item11,item12]
  230. for item in null_check_list:
  231. if item and item != '[]':
  232. not_null_arr.append(item)
  233. nonnull_list.append(not_null_arr)
  234. get_comb_df(copy_df,9)
  235. def get_prefix_span(df_list, n): #n이상 길이를 갖는 규칙들만. 거기다가 Frequency기준 정렬 까지.
  236. prefix_span = PrefixSpan(df_list)
  237. n_ps = prefix_span.frequent(1,filter = lambda patt, matches:len(patt)>n)
  238. ps_df = pd.DataFrame(n_ps)
  239. ps_df.rename(columns={0:'Frequency', 1:'Cause'}, inplace=True)
  240. ps_df['Effect']= np.nan
  241. ps_df = ps_df[['Cause','Effect','Frequency']]
  242. ps_sort_df = ps_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)
  243. return ps_sort_df
  244. test = get_prefix_span(nonnull_list,8)
  245. test
  246. # Define the function that find the rule name
  247. # 데이터 크기를 줄여서 실행해본 결과 정상 작동함.
  248. def get_Effect(df):
  249. for i in range(0,10000):
  250. drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']
  251. for item in df.loc[i,'Cause']:
  252. for drule in drules:
  253. drule_str = 'DRULE_ATT_TYPE_CODE1:' + drule
  254. if item == drule_str:
  255. df.loc[i,'Effect'] = drule
  256. break
  257. return df
  258. tdf = get_Effect(test)
  259. tdf.head(10000) # 10000개로 했을 때, DRULE_ATT_TYPE_CODE 가 있는 항목들은 Effect정상 추출.
  260. tdf = get_Effect(test)
  261. testdf = tdf.tail(1000)
  262. testdf
  263. ### Effect = NaN인 값 지우기.
  264. testdf = tdf.head(1000)
  265. test_result = testdf
  266. testdf
  267. test_result['Effect'] = test_result['Effect'].replace(np.nan,0)
  268. test_result = test_result[test_result.Effect != 0]
  269. test_result.reset_index(drop=True)
  270. # 정규표현식 사용해서 매칭하기.
  271. # 정규표현식 사용하는 틀. words에 배열만 넣으면 된다.
  272. tdf['Cause'] = [','.join(map(str, word))for word in tdf['Cause']]
  273. def regbase(words):
  274. base = r'^{}'
  275. expr = '(?=.*{})'
  276. ret = base.format(''.join(expr.format(w) for w in words))
  277. return ret
  278. def result(n):
  279. comlist = get_comb_df(copy_df,n)
  280. for i in range(0,len(comlist)):
  281. print(comlist[i])
  282. print(tdf[tdf['Cause'].str.contains(regbase(comlist[i]),na=False,regex=True)].reset_index(drop=True,inplace=False))