파이썬 기반의 Prefix span 분석_fork
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

PrefixSpan_20211021.ipynb 47KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "<p>NTM(유해트래픽 탐지장비)</p>\n",
  8. "<p>MTM(악성파일 탐지장비)</p>"
  9. ]
  10. },
  11. {
  12. "cell_type": "code",
  13. "execution_count": 1,
  14. "metadata": {},
  15. "outputs": [
  16. {
  17. "name": "stdout",
  18. "output_type": "stream",
  19. "text": [
  20. "10000\n",
  21. "10000\n"
  22. ]
  23. }
  24. ],
  25. "source": [
  26. "#!/usr/bin/env python\n",
  27. "# coding: utf-8\n",
  28. "\n",
  29. "import pandas as pd\n",
  30. "import numpy as np\n",
  31. "from mlxtend.preprocessing import TransactionEncoder\n",
  32. "from mlxtend.frequent_patterns import association_rules, fpgrowth\n",
  33. "from prefixspan import PrefixSpan\n",
  34. "\n",
  35. "# load ts_data_accident-2020_sample.csv\n",
  36. "# to prevent dtypewarning, set low_memory=False\n",
  37. "df = pd.read_csv('ts_data_accident-2020_sample.csv', low_memory=False)\n",
  38. "df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()\n",
  39. "len(df) #len(df) : 10000, load successful\n",
  40. "\n",
  41. "##################### NTM section #####################\n",
  42. "NTM_df=df[df['ACCD_FIND_MTD_CODE']==1] #* edit'1' to 1\n",
  43. "len(NTM_df)\n",
  44. "#* NTM_df.head()\n",
  45. "\n",
  46. "# Pick out it in order to get the asset, risk, intent, black IP out\n",
  47. "RISK_V2=NTM_df['RISK_V2']\n",
  48. "\n",
  49. "RISK_V2_FILTERED=RISK_V2.dropna()\n",
  50. "print(RISK_V2.size)\n",
  51. "print(RISK_V2_FILTERED.size)\n",
  52. "\n",
  53. "#* 추가 : 기존 filter_assets_value 사용시 값을 인식하지 못하는 문제 발생 -> RISK_V2를 별도의 df로 수정\n",
  54. "import json\n",
  55. "from pandas import json_normalize\n",
  56. "risk_df = pd.DataFrame()\n",
  57. "for newVal in RISK_V2_FILTERED:\n",
  58. " newVal = newVal.replace(\"'\", \"\\\"\")\n",
  59. " newVal_str = json.loads(newVal)\n",
  60. " newVal_df = json_normalize(newVal_str) \n",
  61. " risk_df = pd.concat([risk_df,newVal_df],ignore_index=True) \n",
  62. " \n",
  63. "risk_df_col = risk_df.columns.values.tolist()\n",
  64. "\n",
  65. "# In[352]:\n",
  66. "asset_val = []\n",
  67. "intent_val=[]\n",
  68. "source_val=[]\n",
  69. "def filter_assets_value(risk):\n",
  70. " for i in range(len(risk)):\n",
  71. " risks=[]\n",
  72. " intents=[]\n",
  73. " sources=[]\n",
  74. " try:\n",
  75. " for key in risk_df_col:\n",
  76. " if 'ASSETS_VAL_' in key and risk.iloc[i][key]:\n",
  77. " risk_key_desc = 'RISK_V2.' + key + '=' + get_asset_desc(key)\n",
  78. " risks.append(risk_key_desc)\n",
  79. " if 'INTENT_VAL_' in key and risk.iloc[i][key]:\n",
  80. " intent_key_desc = 'RISK_V2.' + key + '=' + get_intent_desc(key)\n",
  81. " intents.append(intent_key_desc)\n",
  82. " if 'SOURCE_VAL_' in key and risk.iloc[i][key]:\n",
  83. " source_key_desc='RISK_V2.' + key + '=' + get_source_desc(key)\n",
  84. " sources.append(source_key_desc)\n",
  85. " except:\n",
  86. " print(risk)\n",
  87. " print(type(risk))\n",
  88. " finally:\n",
  89. " asset_val.append(risks)\n",
  90. " intent_val.append(intents)\n",
  91. " source_val.append(sources)\n",
  92. " \n",
  93. " \n",
  94. "# modified\n",
  95. "def get_asset_desc(asset_field):\n",
  96. " if asset_field == 'ASSETS_VAL_1':\n",
  97. " return '공인-전체IP대역(유선)'\n",
  98. " elif asset_field == 'ASSETS_VAL_2':\n",
  99. " return '공인-전체IP대역(무선)'\n",
  100. " elif asset_field == 'ASSETS_VAL_3':\n",
  101. " return '공인-WEB서버'\n",
  102. " elif asset_field == 'ASSETS_VAL_4':\n",
  103. " return '공인-내부응용서버'\n",
  104. " elif asset_field == 'ASSETS_VAL_5':\n",
  105. " return '공인-DB서버'\n",
  106. " elif asset_field == 'ASSETS_VAL_6':\n",
  107. " return '공인-패치서버'\n",
  108. " elif asset_field == 'ASSETS_VAL_7':\n",
  109. " return '공인-네트워크'\n",
  110. " elif asset_field == 'ASSETS_VAL_8':\n",
  111. " return '공인-보안'\n",
  112. " elif asset_field == 'ASSETS_VAL_9':\n",
  113. " return '공인-업무용PC'\n",
  114. " elif asset_field == 'ASSETS_VAL_10':\n",
  115. " return '공인-비업무용PC'\n",
  116. " elif asset_field == 'ASSETS_VAL_11':\n",
  117. " return '공인-기타'\n",
  118. " elif asset_field == 'ASSETS_VAL_12':\n",
  119. " return '사설-전체IP대역(유선)'\n",
  120. " elif asset_field == 'ASSETS_VAL_13':\n",
  121. " return '사설-전체IP대역(무선)'\n",
  122. " elif asset_field == 'ASSETS_VAL_14':\n",
  123. " return '사설-WEB서버'\n",
  124. " elif asset_field == 'ASSETS_VAL_15':\n",
  125. " return '사설-내부응용서버'\n",
  126. " elif asset_field == 'ASSETS_VAL_16':\n",
  127. " return '사설-DB서버'\n",
  128. " elif asset_field == 'ASSETS_VAL_17':\n",
  129. " return '사설-패치서버'\n",
  130. " elif asset_field == 'ASSETS_VAL_18':\n",
  131. " return '사설-네트워크'\n",
  132. " elif asset_field == 'ASSETS_VAL_19':\n",
  133. " return '사설-보안'\n",
  134. " elif asset_field == 'ASSETS_VAL_20':\n",
  135. " return '사설-업무용PC'\n",
  136. " elif asset_field == 'ASSETS_VAL_21':\n",
  137. " return '사설-비업무용PC'\n",
  138. " elif asset_field == 'ASSETS_VAL_22':\n",
  139. " return '사설-기타'\n",
  140. " else:\n",
  141. " return ''\n",
  142. "\n",
  143. "\n",
  144. "\n",
  145. "# modified\n",
  146. "def filter_intent(intent):\n",
  147. " intents=[]\n",
  148. " for intent_key in intent:\n",
  149. " if 'INTENT_VAL_' in intent_key and intent[intent_key]:\n",
  150. " intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)\n",
  151. " intents.append(intent_key_desc)\n",
  152. " return intents\n",
  153. "\n",
  154. "\n",
  155. "# In[356]:\n",
  156. "\n",
  157. "\n",
  158. "def get_intent_desc(intent_field):\n",
  159. " if intent_field == 'INTENT_VAL_1':\n",
  160. " return '파괴'\n",
  161. " elif intent_field == 'INTENT_VAL_2':\n",
  162. " return '유출'\n",
  163. " elif intent_field == 'INTENT_VAL_3':\n",
  164. " return '지연'\n",
  165. " elif intent_field == 'INTENT_VAL_4':\n",
  166. " return '잠복'\n",
  167. " elif intent_field == 'INTENT_VAL_5':\n",
  168. " return '단순침입'\n",
  169. " elif intent_field == 'INTENT_VAL_6':\n",
  170. " return 'MD5'\n",
  171. " elif intent_field == 'INTENT_VAL_0':\n",
  172. " return 'Default'\n",
  173. " else:\n",
  174. " return ''\n",
  175. "\n",
  176. "\n",
  177. "# In[358]:\n",
  178. "\n",
  179. "\n",
  180. "# modified\n",
  181. "def filter_source(source):\n",
  182. " sources=[]\n",
  183. " for source_key in source:\n",
  184. " if 'SOURCE_VAL_' in source_key and source[source_key]:\n",
  185. " source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)\n",
  186. " sources.append(source_key_desc)\n",
  187. " return sources\n",
  188. "\n",
  189. "\n",
  190. "# In[359]:\n",
  191. "\n",
  192. "\n",
  193. "def get_source_desc(source_field):\n",
  194. " if source_field=='SOURCE_VAL_1':\n",
  195. " return '북한IP'\n",
  196. " if source_field=='SOURCE_VAL_3':\n",
  197. " return 'ECSC Black IP'\n",
  198. " else:\n",
  199. " return ''\n",
  200. "\n",
  201. "\n"
  202. ]
  203. },
  204. {
  205. "cell_type": "code",
  206. "execution_count": 2,
  207. "metadata": {},
  208. "outputs": [
  209. {
  210. "data": {
  211. "text/plain": [
  212. "Index(['INST_NM', 'DRULE_ATT_TYPE_CODE1', 'TW_ATT_IP', 'TW_ATT_PORT',\n",
  213. " 'TW_DMG_IP', 'TW_DMG_PORT', 'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM',\n",
  214. " 'ACCD_FIND_MTD_CODE', 'DRULE_NM', 'ASSETS_VAL', 'INTENT_VAL',\n",
  215. " 'SOURCE_VAL'],\n",
  216. " dtype='object')"
  217. ]
  218. },
  219. "execution_count": 2,
  220. "metadata": {},
  221. "output_type": "execute_result"
  222. }
  223. ],
  224. "source": [
  225. "filter_assets_value(risk_df)\n",
  226. "#뒤에 isna()를 통해 na값을 0으로 바꿔주는 작업을 하므로, 값이 비어있는 경우 [] 대신 비워두기\n",
  227. "# New assets column\n",
  228. "NTM_df['ASSETS_VAL']= asset_val\n",
  229. "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].astype(str)\n",
  230. "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace('[','', regex=False)\n",
  231. "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace(']','', regex=False)\n",
  232. "NTM_df[:1]\n",
  233. "# New column of intent value\n",
  234. "NTM_df['INTENT_VAL']=intent_val\n",
  235. "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].astype(str)\n",
  236. "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace('[','',regex=False)\n",
  237. "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace(']','',regex=False)\n",
  238. "NTM_df[:1]\n",
  239. "# New column of SOURCE_VAL value\n",
  240. "NTM_df['SOURCE_VAL']=source_val\n",
  241. "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].astype(str)\n",
  242. "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace('[','',regex=False)\n",
  243. "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace(']','',regex=False)\n",
  244. "NTM_df[:5]\n",
  245. "\n",
  246. "# In[361]:\n",
  247. "NTM_df.drop(columns=['RISK_V2'], inplace=True)\n",
  248. "NTM_df.columns"
  249. ]
  250. },
  251. {
  252. "cell_type": "code",
  253. "execution_count": 3,
  254. "metadata": {},
  255. "outputs": [],
  256. "source": [
  257. "#data frame의 i번째 row를 list로 저장하여 itertools.combinations로 모든 조합 만들 예정\n",
  258. "#TW_ATT_IP와 TW_DMG_IP의 값이 같은 경우 어떤 값과의 관계인지 알 수 없으므로 데이터 가공\n",
  259. "NTM_df['TW_ATT_IP']=\"TW_ATT_IP=\"+NTM_df['TW_ATT_IP'].astype(str)\n",
  260. "NTM_df['TW_ATT_PORT']=\"TW_ATT_PORT=\"+NTM_df['TW_ATT_PORT'].astype(str)\n",
  261. "NTM_df['TW_DMG_IP']=\"TW_DMG_IP=\"+NTM_df['TW_DMG_IP'].astype(str)\n",
  262. "NTM_df['TW_DMG_PORT']=\"TW_DMG_PORT=\"+NTM_df['TW_DMG_PORT'].astype(str)"
  263. ]
  264. },
  265. {
  266. "cell_type": "code",
  267. "execution_count": 4,
  268. "metadata": {},
  269. "outputs": [
  270. {
  271. "data": {
  272. "text/plain": [
  273. "INST_NM 0\n",
  274. "DRULE_ATT_TYPE_CODE1 0\n",
  275. "TW_ATT_IP 0\n",
  276. "TW_ATT_PORT 0\n",
  277. "TW_DMG_IP 0\n",
  278. "TW_DMG_PORT 0\n",
  279. "ACCD_DMG_PROTO_NM 0\n",
  280. "TW_ATT_CT_NM 0\n",
  281. "ACCD_FIND_MTD_CODE 0\n",
  282. "DRULE_NM 0\n",
  283. "ASSETS_VAL 0\n",
  284. "INTENT_VAL 0\n",
  285. "SOURCE_VAL 0\n",
  286. "dtype: int64"
  287. ]
  288. },
  289. "execution_count": 4,
  290. "metadata": {},
  291. "output_type": "execute_result"
  292. }
  293. ],
  294. "source": [
  295. "##################### 여기서부터 진행하시면 됩니다. #####################\n",
  296. "##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################\n",
  297. "\n",
  298. "# It should be 13 columns in total\n",
  299. "\n",
  300. "# 1. 기관 INST_NM\n",
  301. "# 2. 공격 DRULE_ATT_TYPE_CODE1\n",
  302. "# 3. 자산 ASSETS_VAL\n",
  303. "# 4. 위협공격ip TW_ATT_IP\n",
  304. "# 5. 위협공격port TW_ATT_PORT\n",
  305. "# 6. 위협피해ip TW_DMG_IP\n",
  306. "# 7. 위협피해port TW_DMG_PORT\n",
  307. "# 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM\n",
  308. "# 9. 공격국가 TW_ATT_CT_NM\n",
  309. "# 10. 의도(7개) INTENT_VAL\n",
  310. "# 11. IP/URL 가중치 SOURCE_VAL\n",
  311. "# 12. 장비 ACCD_FIND_MTD_CODE\n",
  312. "# 13. 탐지규칙명 DRULE_NM\n",
  313. "\n",
  314. "\n",
  315. "# In[363]:\n",
  316. "NTM_df.isna().sum()\n",
  317. "\n",
  318. "\n",
  319. "# Change the Nan to zero\n",
  320. "NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')\n",
  321. "NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')\n",
  322. "NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')\n",
  323. "NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)\n",
  324. "NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)\n",
  325. "NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)\n",
  326. "NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)\n",
  327. "NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')\n",
  328. "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)\n",
  329. "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)\n",
  330. "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)\n",
  331. "NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')\n",
  332. "\n",
  333. "\n",
  334. "# Check NaN out again\n",
  335. "NTM_df.isna().sum()\n"
  336. ]
  337. },
  338. {
  339. "cell_type": "code",
  340. "execution_count": 5,
  341. "metadata": {},
  342. "outputs": [],
  343. "source": [
  344. "# NTM_df의 col을 list로 저장. itertools.combinations로 가능한 시나리오 모두 추출\n",
  345. "\n",
  346. "# ACCD_FIND_MTD_CODE col 지우기\n",
  347. "NTM_df.drop(columns=['ACCD_FIND_MTD_CODE'], inplace=True)"
  348. ]
  349. },
  350. {
  351. "cell_type": "code",
  352. "execution_count": 6,
  353. "metadata": {},
  354. "outputs": [],
  355. "source": [
  356. "from prefixspan import PrefixSpan\n",
  357. "import itertools\n",
  358. "# arr를 매개변수로 받아 n개의 아이템의 조합 반환\n",
  359. "def get_combination(arr, n):\n",
  360. " combination_n = list(itertools.combinations(arr.columns.tolist(),n))\n",
  361. " combination_n = [com for com in combination_n if 'DRULE_ATT_TYPE_CODE1' in com]\n",
  362. " com_list=[]\n",
  363. " # row i 의 (1,2),(1,3)... 이런식으로 하니까 시간 너무 오래걸림\n",
  364. " # (1,2) 조합에 대한 row i, row i+1, row i+2... 이렇게 바꿈\n",
  365. " for m in range(len(combination_n)):\n",
  366. " for i in range(len(arr)):\n",
  367. " temp_list=[]\n",
  368. " temp_df = arr.iloc[i]\n",
  369. " for col in combination_n[m]:\n",
  370. " # 공백 처리\n",
  371. " if(temp_df[col]==''):\n",
  372. " break\n",
  373. " else:\n",
  374. " temp_list.append(temp_df[col])\n",
  375. " com_list.append(temp_list)\n",
  376. " prefix = get_prefixspan(com_list)\n",
  377. " return prefix\n",
  378. "\n",
  379. "def get_prefixspan(load_list):\n",
  380. " n = len(load_list[0])\n",
  381. " save_list = PrefixSpan(load_list)\n",
  382. " #n개 아이템 조합으로 이루어졌는데 n보다 작은 갯수의 아이템으로 이루어진 prefixspan 결과 값 나옴 \n",
  383. " # 방지하기 위해 prefixspan의 결과값에는 'n개의 아이템의 값'이 다 들어가도록 filter 설정\n",
  384. " save_list = save_list.frequent(1,filter = lambda patt, matches:len(patt)>=n)\n",
  385. " save_df = pd.DataFrame(save_list)\n",
  386. " save_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)\n",
  387. " save_df = save_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)\n",
  388. " save_df = get_effect(save_df)\n",
  389. " return save_df\n",
  390. "\n",
  391. "def get_effect(edit_df):\n",
  392. " #Make the new column for filling the Effect\n",
  393. " edit_df['Effect']=np.nan\n",
  394. " #Change the order of columns\n",
  395. " edit_df=edit_df[['Cause','Effect','Frequency']]\n",
  396. " for i in range(len(edit_df)):\n",
  397. " drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']\n",
  398. " temp_df = edit_df.loc[i]\n",
  399. " for item in temp_df['Cause']:\n",
  400. " for drule in drules:\n",
  401. " if item == drule:\n",
  402. " edit_df.loc[i,'Effect'] = item\n",
  403. " return edit_df\n"
  404. ]
  405. },
  406. {
  407. "cell_type": "code",
  408. "execution_count": 7,
  409. "metadata": {},
  410. "outputs": [
  411. {
  412. "data": {
  413. "text/html": [
  414. "<div>\n",
  415. "<style scoped>\n",
  416. " .dataframe tbody tr th:only-of-type {\n",
  417. " vertical-align: middle;\n",
  418. " }\n",
  419. "\n",
  420. " .dataframe tbody tr th {\n",
  421. " vertical-align: top;\n",
  422. " }\n",
  423. "\n",
  424. " .dataframe thead th {\n",
  425. " text-align: right;\n",
  426. " }\n",
  427. "</style>\n",
  428. "<table border=\"1\" class=\"dataframe\">\n",
  429. " <thead>\n",
  430. " <tr style=\"text-align: right;\">\n",
  431. " <th></th>\n",
  432. " <th>Cause</th>\n",
  433. " <th>Effect</th>\n",
  434. " <th>Frequency</th>\n",
  435. " </tr>\n",
  436. " </thead>\n",
  437. " <tbody>\n",
  438. " <tr>\n",
  439. " <th>0</th>\n",
  440. " <td>[Attack, 'RISK_V2.INTENT_VAL_5=단순침입']</td>\n",
  441. " <td>Attack</td>\n",
  442. " <td>7709</td>\n",
  443. " </tr>\n",
  444. " <tr>\n",
  445. " <th>1</th>\n",
  446. " <td>[Attack, 'RISK_V2.ASSETS_VAL_1=공인-전체IP대역(유선)']</td>\n",
  447. " <td>Attack</td>\n",
  448. " <td>3175</td>\n",
  449. " </tr>\n",
  450. " <tr>\n",
  451. " <th>2</th>\n",
  452. " <td>[Attack, Attack-Scan-29-01-PHPUnit(CVE17-9841)...</td>\n",
  453. " <td>Attack</td>\n",
  454. " <td>2770</td>\n",
  455. " </tr>\n",
  456. " <tr>\n",
  457. " <th>3</th>\n",
  458. " <td>[Attack, 중국]</td>\n",
  459. " <td>Attack</td>\n",
  460. " <td>2689</td>\n",
  461. " </tr>\n",
  462. " <tr>\n",
  463. " <th>4</th>\n",
  464. " <td>[Attack, 'RISK_V2.SOURCE_VAL_3=ECSC Black IP']</td>\n",
  465. " <td>Attack</td>\n",
  466. " <td>1904</td>\n",
  467. " </tr>\n",
  468. " <tr>\n",
  469. " <th>...</th>\n",
  470. " <td>...</td>\n",
  471. " <td>...</td>\n",
  472. " <td>...</td>\n",
  473. " </tr>\n",
  474. " <tr>\n",
  475. " <th>41145</th>\n",
  476. " <td>[Attack, TW_ATT_PORT=5389]</td>\n",
  477. " <td>Attack</td>\n",
  478. " <td>1</td>\n",
  479. " </tr>\n",
  480. " <tr>\n",
  481. " <th>41146</th>\n",
  482. " <td>[Attack, TW_ATT_PORT=38677]</td>\n",
  483. " <td>Attack</td>\n",
  484. " <td>1</td>\n",
  485. " </tr>\n",
  486. " <tr>\n",
  487. " <th>41147</th>\n",
  488. " <td>[Attack, TW_ATT_PORT=8287]</td>\n",
  489. " <td>Attack</td>\n",
  490. " <td>1</td>\n",
  491. " </tr>\n",
  492. " <tr>\n",
  493. " <th>41148</th>\n",
  494. " <td>[Attack, TW_ATT_PORT=2404]</td>\n",
  495. " <td>Attack</td>\n",
  496. " <td>1</td>\n",
  497. " </tr>\n",
  498. " <tr>\n",
  499. " <th>41149</th>\n",
  500. " <td>[Seoul Christian University, Malwr]</td>\n",
  501. " <td>Malwr</td>\n",
  502. " <td>1</td>\n",
  503. " </tr>\n",
  504. " </tbody>\n",
  505. "</table>\n",
  506. "<p>41150 rows × 3 columns</p>\n",
  507. "</div>"
  508. ],
  509. "text/plain": [
  510. " Cause Effect Frequency\n",
  511. "0 [Attack, 'RISK_V2.INTENT_VAL_5=단순침입'] Attack 7709\n",
  512. "1 [Attack, 'RISK_V2.ASSETS_VAL_1=공인-전체IP대역(유선)'] Attack 3175\n",
  513. "2 [Attack, Attack-Scan-29-01-PHPUnit(CVE17-9841)... Attack 2770\n",
  514. "3 [Attack, 중국] Attack 2689\n",
  515. "4 [Attack, 'RISK_V2.SOURCE_VAL_3=ECSC Black IP'] Attack 1904\n",
  516. "... ... ... ...\n",
  517. "41145 [Attack, TW_ATT_PORT=5389] Attack 1\n",
  518. "41146 [Attack, TW_ATT_PORT=38677] Attack 1\n",
  519. "41147 [Attack, TW_ATT_PORT=8287] Attack 1\n",
  520. "41148 [Attack, TW_ATT_PORT=2404] Attack 1\n",
  521. "41149 [Seoul Christian University, Malwr] Malwr 1\n",
  522. "\n",
  523. "[41150 rows x 3 columns]"
  524. ]
  525. },
  526. "execution_count": 7,
  527. "metadata": {},
  528. "output_type": "execute_result"
  529. }
  530. ],
  531. "source": [
  532. "# 1. 두 아이템의 조합\n",
  533. "item = 2\n",
  534. "prefix_of_two = get_combination(NTM_df, item)\n",
  535. "prefix_of_two"
  536. ]
  537. },
  538. {
  539. "cell_type": "code",
  540. "execution_count": 8,
  541. "metadata": {},
  542. "outputs": [
  543. {
  544. "ename": "KeyboardInterrupt",
  545. "evalue": "",
  546. "output_type": "error",
  547. "traceback": [
  548. "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
  549. "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
  550. "\u001b[1;32m<ipython-input-8-fdb1732ee6a2>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 2. 세 아이템의 조합\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mprefix_of_three\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_combination\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mNTM_df\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m3\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
  551. "\u001b[1;32m<ipython-input-6-7cca23a52bd5>\u001b[0m in \u001b[0;36mget_combination\u001b[1;34m(arr, n)\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[0mtemp_list\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtemp_df\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[0mcom_list\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtemp_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 21\u001b[1;33m \u001b[0mprefix\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_prefixspan\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcom_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 22\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mprefix\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
  552. "\u001b[1;32m<ipython-input-6-7cca23a52bd5>\u001b[0m in \u001b[0;36mget_prefixspan\u001b[1;34m(load_list)\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[0msave_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;34m'Frequency'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;34m'Cause'\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0minplace\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[0msave_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msave_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Frequency'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mascending\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mignore_index\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 33\u001b[1;33m \u001b[0msave_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_effect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msave_df\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 34\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0msave_df\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
  553. "\u001b[1;32m<ipython-input-6-7cca23a52bd5>\u001b[0m in \u001b[0;36mget_effect\u001b[1;34m(edit_df)\u001b[0m\n\u001b[0;32m 45\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mdrule\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdrules\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mitem\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mdrule\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 47\u001b[1;33m \u001b[0medit_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'Effect'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mitem\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 48\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0medit_df\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  554. "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m 690\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 691\u001b[0m \u001b[0miloc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"iloc\"\u001b[0m \u001b[1;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 692\u001b[1;33m \u001b[0miloc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_setitem_with_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 693\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 694\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_validate_key\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  555. "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_setitem_with_indexer\u001b[1;34m(self, indexer, value, name)\u001b[0m\n\u001b[0;32m 1633\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtake_split_path\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1634\u001b[0m \u001b[1;31m# We have to operate column-wise\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1635\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_setitem_with_indexer_split_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1636\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1637\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_setitem_single_block\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  556. "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_setitem_with_indexer_split_path\u001b[1;34m(self, indexer, value, name)\u001b[0m\n\u001b[0;32m 1718\u001b[0m \u001b[1;31m# scalar value\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1719\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mloc\u001b[0m \u001b[1;32min\u001b[0m \u001b[0milocs\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1720\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_setitem_single_column\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpi\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1721\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1722\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_setitem_with_indexer_2d_value\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  557. "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_setitem_single_column\u001b[1;34m(self, loc, value, plane_indexer)\u001b[0m\n\u001b[0;32m 1815\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1816\u001b[0m \u001b[1;31m# reset the sliced object if unique\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1817\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iset_item\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mser\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1818\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1819\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_setitem_single_block\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  558. "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m_iset_item\u001b[1;34m(self, loc, value)\u001b[0m\n\u001b[0;32m 3220\u001b[0m \u001b[1;31m# technically _sanitize_column expects a label, not a position,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3221\u001b[0m \u001b[1;31m# but the behavior is the same as long as we pass broadcast=False\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3222\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbroadcast\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3223\u001b[0m \u001b[0mNDFrame\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iset_item\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mloc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3224\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
  559. "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m_sanitize_column\u001b[1;34m(self, key, value, broadcast)\u001b[0m\n\u001b[0;32m 3874\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3875\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3876\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreindexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3877\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3878\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  560. "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mreindexer\u001b[1;34m(value)\u001b[0m\n\u001b[0;32m 3855\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3856\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mequals\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3857\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3858\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3859\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
  561. "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
  562. ]
  563. }
  564. ],
  565. "source": [
  566. "# 2. 세 아이템의 조합\n",
  567. "prefix_of_three = get_combination(NTM_df, 3)"
  568. ]
  569. },
  570. {
  571. "cell_type": "code",
  572. "execution_count": null,
  573. "metadata": {},
  574. "outputs": [],
  575. "source": [
  576. "# 3. 네 아이템의 조합\n",
  577. "prefix_of_four = get_combination(NTM_df, 4)"
  578. ]
  579. },
  580. {
  581. "cell_type": "code",
  582. "execution_count": null,
  583. "metadata": {},
  584. "outputs": [],
  585. "source": [
  586. "# 4. 다섯 아이템의 조합\n",
  587. "prefix_of_five = get_combination(NTM_df, 5)"
  588. ]
  589. },
  590. {
  591. "cell_type": "code",
  592. "execution_count": null,
  593. "metadata": {},
  594. "outputs": [],
  595. "source": [
  596. "# 5. 여섯 아이템의 조합\n",
  597. "prefix_of_six = get_combination(NTM_df, 6)\n",
  598. "##################### NTM section End #####################"
  599. ]
  600. },
  601. {
  602. "cell_type": "code",
  603. "execution_count": null,
  604. "metadata": {},
  605. "outputs": [],
  606. "source": [
  607. "##################### MTM section #####################\n",
  608. "# Same goes for the MTM section\n",
  609. "\n",
  610. "# In[375]:\n",
  611. "\n",
  612. "\n",
  613. "MTM_df=df[df['ACCD_FIND_MTD_CODE']==2].copy()\n",
  614. "len(MTM_df)\n",
  615. "\n",
  616. "\n",
  617. "# In[376]:\n",
  618. "\n",
  619. "\n",
  620. "# Pick out it in order to get the asset, risk, intent, black IP out\n",
  621. "RISK_V2_MTM=MTM_df['RISK_V2']\n",
  622. "\n",
  623. "RISK_V2_FILTERED_MTM=RISK_V2_MTM.dropna()\n",
  624. "print(RISK_V2_MTM.size)\n",
  625. "print(RISK_V2_FILTERED_MTM.size)\n",
  626. "\n",
  627. "risk_df_MTM = pd.DataFrame()\n",
  628. "for newVal_MTM in RISK_V2_FILTERED_MTM:\n",
  629. " newVal_MTM = newVal_MTM.replace(\"'\", \"\\\"\")\n",
  630. " newVal_MTM_str = json.loads(newVal_MTM)\n",
  631. " newVal_df_MTM = json_normalize(newVal_MTM_str) \n",
  632. " risk_df_MTM = pd.concat([risk_df_MTM,newVal_df_MTM],ignore_index=True) \n",
  633. " \n",
  634. "risk_df_col_MTM = risk_df_MTM.columns.values.tolist()\n",
  635. "\n",
  636. "# In[377]:\n",
  637. "\n",
  638. "\n",
  639. "asset_val_MTM = []\n",
  640. "intent_val_MTM=[]\n",
  641. "source_val_MTM=[]\n",
  642. "\n",
  643. "def filter_assets_value_MTM(risk):\n",
  644. " for i in range(len(risk)):\n",
  645. " risks=[]\n",
  646. " intents=[]\n",
  647. " sources=[]\n",
  648. " try:\n",
  649. " for key in risk_df_col_MTM:\n",
  650. " if 'ASSETS_VAL_' in key and risk.iloc[i][key]:\n",
  651. " risk_key_desc = 'RISK_V2.' + key + '=' + get_asset_desc_MTM(key)\n",
  652. " risks.append(risk_key_desc)\n",
  653. " if 'INTENT_VAL_' in key and risk.iloc[i][key]:\n",
  654. " intent_key_desc = 'RISK_V2.' + key + '=' + get_intent_desc_MTM(key)\n",
  655. " intents.append(intent_key_desc)\n",
  656. " if 'SOURCE_VAL_' in key and risk.iloc[i][key]:\n",
  657. " source_key_desc='RISK_V2.' + key + '=' + get_source_desc_MTM(key)\n",
  658. " sources.append(source_key_desc)\n",
  659. " except:\n",
  660. " print(risk)\n",
  661. " print(type(risk))\n",
  662. " finally:\n",
  663. " asset_val_MTM.append(risks)\n",
  664. " intent_val_MTM.append(intents)\n",
  665. " source_val_MTM.append(sources)\n",
  666. "\n",
  667. "# In[378]:\n",
  668. "\n",
  669. "# modified\n",
  670. "def get_asset_desc_MTM(asset_field):\n",
  671. " if asset_field == 'ASSETS_VAL_1':\n",
  672. " return '공인-전체IP대역(유선)'\n",
  673. " elif asset_field == 'ASSETS_VAL_2':\n",
  674. " return '공인-전체IP대역(무선)'\n",
  675. " elif asset_field == 'ASSETS_VAL_3':\n",
  676. " return '공인-WEB서버'\n",
  677. " elif asset_field == 'ASSETS_VAL_4':\n",
  678. " return '공인-내부응용서버'\n",
  679. " elif asset_field == 'ASSETS_VAL_5':\n",
  680. " return '공인-DB서버'\n",
  681. " elif asset_field == 'ASSETS_VAL_6':\n",
  682. " return '공인-패치서버'\n",
  683. " elif asset_field == 'ASSETS_VAL_7':\n",
  684. " return '공인-네트워크'\n",
  685. " elif asset_field == 'ASSETS_VAL_8':\n",
  686. " return '공인-보안'\n",
  687. " elif asset_field == 'ASSETS_VAL_9':\n",
  688. " return '공인-업무용PC'\n",
  689. " elif asset_field == 'ASSETS_VAL_10':\n",
  690. " return '공인-비업무용PC'\n",
  691. " elif asset_field == 'ASSETS_VAL_11':\n",
  692. " return '공인-기타'\n",
  693. " elif asset_field == 'ASSETS_VAL_12':\n",
  694. " return '사설-전체IP대역(유선)'\n",
  695. " elif asset_field == 'ASSETS_VAL_13':\n",
  696. " return '사설-전체IP대역(무선)'\n",
  697. " elif asset_field == 'ASSETS_VAL_14':\n",
  698. " return '사설-WEB서버'\n",
  699. " elif asset_field == 'ASSETS_VAL_15':\n",
  700. " return '사설-내부응용서버'\n",
  701. " elif asset_field == 'ASSETS_VAL_16':\n",
  702. " return '사설-DB서버'\n",
  703. " elif asset_field == 'ASSETS_VAL_17':\n",
  704. " return '사설-패치서버'\n",
  705. " elif asset_field == 'ASSETS_VAL_18':\n",
  706. " return '사설-네트워크'\n",
  707. " elif asset_field == 'ASSETS_VAL_19':\n",
  708. " return '사설-보안'\n",
  709. " elif asset_field == 'ASSETS_VAL_20':\n",
  710. " return '사설-업무용PC'\n",
  711. " elif asset_field == 'ASSETS_VAL_21':\n",
  712. " return '사설-비업무용PC'\n",
  713. " elif asset_field == 'ASSETS_VAL_22':\n",
  714. " return '사설-기타'\n",
  715. " else:\n",
  716. " return ''\n",
  717. "\n",
  718. "\n",
  719. "# In[381]:\n",
  720. "\n",
  721. "\n",
  722. "# modified\n",
  723. "def filter_intent_MTM(intent):\n",
  724. " intents=[]\n",
  725. " for intent_key in intent:\n",
  726. " if 'INTENT_VAL_' in intent_key and intent[intent_key]:\n",
  727. " intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)\n",
  728. " intents.append(intent_key_desc)\n",
  729. " return intents\n",
  730. "\n",
  731. "\n",
  732. "# In[382]:\n",
  733. "\n",
  734. "\n",
  735. "def get_intent_desc_MTM(intent_field):\n",
  736. " if intent_field == 'INTENT_VAL_1':\n",
  737. " return '파괴'\n",
  738. " elif intent_field == 'INTENT_VAL_2':\n",
  739. " return '유출'\n",
  740. " elif intent_field == 'INTENT_VAL_3':\n",
  741. " return '지연'\n",
  742. " elif intent_field == 'INTENT_VAL_4':\n",
  743. " return '잠복'\n",
  744. " elif intent_field == 'INTENT_VAL_5':\n",
  745. " return '단순침입'\n",
  746. " elif intent_field == 'INTENT_VAL_6':\n",
  747. " return 'MD5'\n",
  748. " elif intent_field == 'INTENT_VAL_0':\n",
  749. " return 'Default'\n",
  750. " else:\n",
  751. " return ''\n",
  752. "\n",
  753. "\n",
  754. "\n",
  755. "# In[384]:\n",
  756. "\n",
  757. "\n",
  758. "# modified\n",
  759. "def filter_source_MTM(source):\n",
  760. " sources=[]\n",
  761. " for source_key in source:\n",
  762. " if 'SOURCE_VAL_' in source_key and source[source_key]:\n",
  763. " source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)\n",
  764. " sources.append(source_key_desc)\n",
  765. " return sources\n",
  766. "\n",
  767. "\n",
  768. "# In[385]:\n",
  769. "\n",
  770. "\n",
  771. "def get_source_desc_MTM(source_field):\n",
  772. " if source_field=='SOURCE_VAL_1':\n",
  773. " return '북한IP'\n",
  774. " if source_field=='SOURCE_VAL_3':\n",
  775. " return 'ECSC Black IP'\n",
  776. " else:\n",
  777. " return ''\n",
  778. "\n",
  779. "\n",
  780. "# In[386]:\n",
  781. "\n",
  782. "filter_assets_value_MTM(risk_df_MTM)\n",
  783. "#뒤에 isna()를 통해 na값을 0으로 바꿔주는 작업을 하므로, 값이 비어있는 경우 [] 대신 비워두기\n",
  784. "# New assets column\n",
  785. "MTM_df['ASSETS_VAL']= asset_val_MTM\n",
  786. "MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].astype(str)\n",
  787. "MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].str.replace('[','', regex=False)\n",
  788. "MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].str.replace(']','', regex=False)\n",
  789. "MTM_df[:1]\n",
  790. "# New column of intent value\n",
  791. "MTM_df['INTENT_VAL']=intent_val_MTM\n",
  792. "MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].astype(str)\n",
  793. "MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].str.replace('[','',regex=False)\n",
  794. "MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].str.replace(']','',regex=False)\n",
  795. "MTM_df[:1]\n",
  796. "# New column of SOURCE_VAL value\n",
  797. "MTM_df['SOURCE_VAL']=source_val_MTM\n",
  798. "MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].astype(str)\n",
  799. "MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].str.replace('[','',regex=False)\n",
  800. "MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].str.replace(']','',regex=False)\n",
  801. "MTM_df[:5]\n",
  802. "\n",
  803. "# In[361]:\n",
  804. "MTM_df.drop(columns=['RISK_V2'], inplace=True)\n",
  805. "MTM_df.columns\n",
  806. "\n",
  807. "\n",
  808. "# In[388]:\n",
  809. "\n",
  810. "\n",
  811. "MTM_df.isna().sum()\n",
  812. "\n",
  813. "\n",
  814. "# In[389]:\n",
  815. "\n",
  816. "\n",
  817. "# Change the Nan to zero\n",
  818. "MTM_df['ACCD_DMG_PROTO_NM']=MTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')\n",
  819. "MTM_df['INST_NM']=MTM_df['INST_NM'].replace(np.nan,'')\n",
  820. "MTM_df['DRULE_ATT_TYPE_CODE1']=MTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')\n",
  821. "MTM_df['TW_ATT_IP']=MTM_df['TW_ATT_IP'].replace(np.nan,0)\n",
  822. "MTM_df['TW_ATT_PORT']=MTM_df['TW_ATT_PORT'].replace(np.nan,0)\n",
  823. "MTM_df['TW_DMG_IP']=MTM_df['TW_DMG_IP'].replace(np.nan,0)\n",
  824. "MTM_df['TW_DMG_PORT']=MTM_df['TW_DMG_PORT'].replace(np.nan,0)\n",
  825. "MTM_df['TW_ATT_CT_NM']=MTM_df['TW_ATT_CT_NM'].replace(np.nan,'')\n",
  826. "MTM_df['ASSETS_VAL']=MTM_df['ASSETS_VAL'].replace(np.nan,0)\n",
  827. "MTM_df['INTENT_VAL']=MTM_df['INTENT_VAL'].replace(np.nan,0)\n",
  828. "MTM_df['SOURCE_VAL']=MTM_df['SOURCE_VAL'].replace(np.nan,0)\n",
  829. "MTM_df['DRULE_NM']=MTM_df['DRULE_NM'].replace(np.nan,'')\n",
  830. "\n",
  831. "\n",
  832. "# In[390]:\n",
  833. "\n",
  834. "\n",
  835. "# Check NaN out again\n",
  836. "MTM_df.isna().sum()\n",
  837. "\n",
  838. "\n",
  839. "# In[391]:\n",
  840. "\n",
  841. "# ACCD_FIND_MTD_CODE col 지우기\n",
  842. "MTM_df.drop(columns=['ACCD_FIND_MTD_CODE'], inplace=True)\n",
  843. "\n",
  844. "# arr를 매개변수로 받아 n개의 아이템의 조합 반환\n",
  845. "def get_combination_MTM(arr, n):\n",
  846. " combination_n = list(itertools.combinations(arr.columns.tolist(),n))\n",
  847. " combination_n = [com for com in combination_n if 'DRULE_ATT_TYPE_CODE1' in com]\n",
  848. " com_list=[]\n",
  849. " for m in range(len(combination_n)):\n",
  850. " for i in range(len(arr)):\n",
  851. " temp_list=[]\n",
  852. " temp_df = arr.iloc[i]\n",
  853. " for col in combination_n[m]:\n",
  854. " # 공백 처리\n",
  855. " if(temp_df[col]==''):\n",
  856. " break\n",
  857. " else:\n",
  858. " temp_list.append(temp_df[col])\n",
  859. " com_list.append(temp_list)\n",
  860. " prefix = get_prefixspan_MTM(com_list)\n",
  861. " return prefix\n",
  862. "\n",
  863. "def get_prefixspan_MTM(load_list):\n",
  864. " n = len(load_list[0])\n",
  865. " save_list = PrefixSpan(load_list)\n",
  866. " #n개 아이템 조합으로 이루어졌는데 n보다 작은 갯수의 아이템으로 이루어진 prefixspan 결과 값 나옴 \n",
  867. " # 방지하기 위해 prefixspan의 결과값에는 'n개의 아이템의 값'이 다 들어가도록 filter 설정\n",
  868. " save_list = save_list.frequent(1,filter = lambda patt, matches:len(patt)>=n)\n",
  869. " save_df = pd.DataFrame(save_list)\n",
  870. " save_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)\n",
  871. " save_df = save_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)\n",
  872. " save_df = get_effect_MTM(save_df)\n",
  873. " return save_df\n",
  874. "\n",
  875. "def get_effect_MTM(edit_df):\n",
  876. " #Make the new column for filling the Effect\n",
  877. " edit_df['Effect']=np.nan\n",
  878. " #Change the order of columns\n",
  879. " edit_df=edit_df[['Cause','Effect','Frequency']]\n",
  880. " for i in range(len(edit_df)):\n",
  881. " drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']\n",
  882. " temp_df = edit_df.loc[i]\n",
  883. " for item in temp_df['Cause']:\n",
  884. " for drule in drules:\n",
  885. " if item == drule:\n",
  886. " edit_df.loc[i,'Effect'] = item\n",
  887. " return edit_df\n",
  888. "\n",
  889. "\n",
  890. "\n",
  891. "# 1. 두 아이템의 조합\n",
  892. "prefix_of_two_MTM = get_combination_MTM(MTM_df,2)\n",
  893. "\n",
  894. "# 2. 세 아이템의 조합\n",
  895. "prefix_of_three_MTM = get_combination_MTM(MTM_df, 3)\n",
  896. "\n",
  897. "# 3. 네 아이템의 조합\n",
  898. "prefix_of_four_MTM = get_combination_MTM(MTM_df, 4)\n",
  899. "\n",
  900. "# 4. 다섯 아이템의 조합\n",
  901. "prefix_of_five_MTM = get_combination_MTM(MTM_df, 5)\n",
  902. "\n",
  903. "\n",
  904. "# 5. 여섯 아이템의 조합\n",
  905. "prefix_of_six_MTM = get_combination_MTM(MTM_df, 6)\n",
  906. "\n",
  907. "##################### MTM section End #####################"
  908. ]
  909. }
  910. ],
  911. "metadata": {
  912. "anaconda-cloud": {},
  913. "kernelspec": {
  914. "display_name": "Python 3",
  915. "language": "python",
  916. "name": "python3"
  917. },
  918. "language_info": {
  919. "codemirror_mode": {
  920. "name": "ipython",
  921. "version": 3
  922. },
  923. "file_extension": ".py",
  924. "mimetype": "text/x-python",
  925. "name": "python",
  926. "nbconvert_exporter": "python",
  927. "pygments_lexer": "ipython3",
  928. "version": "3.8.8"
  929. }
  930. },
  931. "nbformat": 4,
  932. "nbformat_minor": 4
  933. }