{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "
NTM(유해트래픽 탐지장비)
\n", "MTM(악성파일 탐지장비)
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10000\n", "10000\n" ] } ], "source": [ "#!/usr/bin/env python\n", "# coding: utf-8\n", "\n", "import pandas as pd\n", "import numpy as np\n", "from mlxtend.preprocessing import TransactionEncoder\n", "from mlxtend.frequent_patterns import association_rules, fpgrowth\n", "from prefixspan import PrefixSpan\n", "\n", "# load ts_data_accident-2020_sample.csv\n", "# to prevent dtypewarning, set low_memory=False\n", "df = pd.read_csv('ts_data_accident-2020_sample.csv', low_memory=False)\n", "df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE','DRULE_NM']].dropna()\n", "len(df) #len(df) : 10000, load successful\n", "\n", "##################### NTM section #####################\n", "NTM_df=df[df['ACCD_FIND_MTD_CODE']==1] #* edit'1' to 1\n", "len(NTM_df)\n", "#* NTM_df.head()\n", "\n", "# Pick out it in order to get the asset, risk, intent, black IP out\n", "RISK_V2=NTM_df['RISK_V2']\n", "\n", "RISK_V2_FILTERED=RISK_V2.dropna()\n", "print(RISK_V2.size)\n", "print(RISK_V2_FILTERED.size)\n", "\n", "#* 추가 : 기존 filter_assets_value 사용시 값을 인식하지 못하는 문제 발생 -> RISK_V2를 별도의 df로 수정\n", "import json\n", "from pandas import json_normalize\n", "risk_df = pd.DataFrame()\n", "for newVal in RISK_V2_FILTERED:\n", " newVal = newVal.replace(\"'\", \"\\\"\")\n", " newVal_str = json.loads(newVal)\n", " newVal_df = json_normalize(newVal_str) \n", " risk_df = pd.concat([risk_df,newVal_df],ignore_index=True) \n", " \n", "risk_df_col = risk_df.columns.values.tolist()\n", "\n", "# In[352]:\n", "asset_val = []\n", "intent_val=[]\n", "source_val=[]\n", "def filter_assets_value(risk):\n", " for i in range(len(risk)):\n", " risks=[]\n", " intents=[]\n", " sources=[]\n", " try:\n", " for key in risk_df_col:\n", " if 'ASSETS_VAL_' in key and risk.iloc[i][key]:\n", " risk_key_desc = 'RISK_V2.' + key + '=' + get_asset_desc(key)\n", " risks.append(risk_key_desc)\n", " if 'INTENT_VAL_' in key and risk.iloc[i][key]:\n", " intent_key_desc = 'RISK_V2.' + key + '=' + get_intent_desc(key)\n", " intents.append(intent_key_desc)\n", " if 'SOURCE_VAL_' in key and risk.iloc[i][key]:\n", " source_key_desc='RISK_V2.' + key + '=' + get_source_desc(key)\n", " sources.append(source_key_desc)\n", " except:\n", " print(risk)\n", " print(type(risk))\n", " finally:\n", " asset_val.append(risks)\n", " intent_val.append(intents)\n", " source_val.append(sources)\n", " \n", " \n", "# modified\n", "def get_asset_desc(asset_field):\n", " if asset_field == 'ASSETS_VAL_1':\n", " return '공인-전체IP대역(유선)'\n", " elif asset_field == 'ASSETS_VAL_2':\n", " return '공인-전체IP대역(무선)'\n", " elif asset_field == 'ASSETS_VAL_3':\n", " return '공인-WEB서버'\n", " elif asset_field == 'ASSETS_VAL_4':\n", " return '공인-내부응용서버'\n", " elif asset_field == 'ASSETS_VAL_5':\n", " return '공인-DB서버'\n", " elif asset_field == 'ASSETS_VAL_6':\n", " return '공인-패치서버'\n", " elif asset_field == 'ASSETS_VAL_7':\n", " return '공인-네트워크'\n", " elif asset_field == 'ASSETS_VAL_8':\n", " return '공인-보안'\n", " elif asset_field == 'ASSETS_VAL_9':\n", " return '공인-업무용PC'\n", " elif asset_field == 'ASSETS_VAL_10':\n", " return '공인-비업무용PC'\n", " elif asset_field == 'ASSETS_VAL_11':\n", " return '공인-기타'\n", " elif asset_field == 'ASSETS_VAL_12':\n", " return '사설-전체IP대역(유선)'\n", " elif asset_field == 'ASSETS_VAL_13':\n", " return '사설-전체IP대역(무선)'\n", " elif asset_field == 'ASSETS_VAL_14':\n", " return '사설-WEB서버'\n", " elif asset_field == 'ASSETS_VAL_15':\n", " return '사설-내부응용서버'\n", " elif asset_field == 'ASSETS_VAL_16':\n", " return '사설-DB서버'\n", " elif asset_field == 'ASSETS_VAL_17':\n", " return '사설-패치서버'\n", " elif asset_field == 'ASSETS_VAL_18':\n", " return '사설-네트워크'\n", " elif asset_field == 'ASSETS_VAL_19':\n", " return '사설-보안'\n", " elif asset_field == 'ASSETS_VAL_20':\n", " return '사설-업무용PC'\n", " elif asset_field == 'ASSETS_VAL_21':\n", " return '사설-비업무용PC'\n", " elif asset_field == 'ASSETS_VAL_22':\n", " return '사설-기타'\n", " else:\n", " return ''\n", "\n", "\n", "\n", "# modified\n", "def filter_intent(intent):\n", " intents=[]\n", " for intent_key in intent:\n", " if 'INTENT_VAL_' in intent_key and intent[intent_key]:\n", " intent_key_desc = 'RISK_V2.' + intent_key + '=' + get_intent_desc(intent_key)\n", " intents.append(intent_key_desc)\n", " return intents\n", "\n", "\n", "# In[356]:\n", "\n", "\n", "def get_intent_desc(intent_field):\n", " if intent_field == 'INTENT_VAL_1':\n", " return '파괴'\n", " elif intent_field == 'INTENT_VAL_2':\n", " return '유출'\n", " elif intent_field == 'INTENT_VAL_3':\n", " return '지연'\n", " elif intent_field == 'INTENT_VAL_4':\n", " return '잠복'\n", " elif intent_field == 'INTENT_VAL_5':\n", " return '단순침입'\n", " elif intent_field == 'INTENT_VAL_6':\n", " return 'MD5'\n", " elif intent_field == 'INTENT_VAL_0':\n", " return 'Default'\n", " else:\n", " return ''\n", "\n", "\n", "# In[358]:\n", "\n", "\n", "# modified\n", "def filter_source(source):\n", " sources=[]\n", " for source_key in source:\n", " if 'SOURCE_VAL_' in source_key and source[source_key]:\n", " source_key_desc='RISK_V2.' + source_key + '=' + get_source_desc(source_key)\n", " sources.append(source_key_desc)\n", " return sources\n", "\n", "\n", "# In[359]:\n", "\n", "\n", "def get_source_desc(source_field):\n", " if source_field=='SOURCE_VAL_1':\n", " return '북한IP'\n", " if source_field=='SOURCE_VAL_3':\n", " return 'ECSC Black IP'\n", " else:\n", " return ''\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['INST_NM', 'DRULE_ATT_TYPE_CODE1', 'TW_ATT_IP', 'TW_ATT_PORT',\n", " 'TW_DMG_IP', 'TW_DMG_PORT', 'ACCD_DMG_PROTO_NM', 'TW_ATT_CT_NM',\n", " 'ACCD_FIND_MTD_CODE', 'DRULE_NM', 'ASSETS_VAL', 'INTENT_VAL',\n", " 'SOURCE_VAL'],\n", " dtype='object')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filter_assets_value(risk_df)\n", "#뒤에 isna()를 통해 na값을 0으로 바꿔주는 작업을 하므로, 값이 비어있는 경우 [] 대신 비워두기\n", "# New assets column\n", "NTM_df['ASSETS_VAL']= asset_val\n", "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].astype(str)\n", "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace('[','', regex=False)\n", "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].str.replace(']','', regex=False)\n", "NTM_df[:1]\n", "# New column of intent value\n", "NTM_df['INTENT_VAL']=intent_val\n", "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].astype(str)\n", "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace('[','',regex=False)\n", "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].str.replace(']','',regex=False)\n", "NTM_df[:1]\n", "# New column of SOURCE_VAL value\n", "NTM_df['SOURCE_VAL']=source_val\n", "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].astype(str)\n", "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace('[','',regex=False)\n", "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].str.replace(']','',regex=False)\n", "NTM_df[:5]\n", "\n", "# In[361]:\n", "NTM_df.drop(columns=['RISK_V2'], inplace=True)\n", "NTM_df.columns" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#data frame의 i번째 row를 list로 저장하여 itertools.combinations로 모든 조합 만들 예정\n", "#TW_ATT_IP와 TW_DMG_IP의 값이 같은 경우 어떤 값과의 관계인지 알 수 없으므로 데이터 가공\n", "NTM_df['TW_ATT_IP']=\"TW_ATT_IP=\"+NTM_df['TW_ATT_IP'].astype(str)\n", "NTM_df['TW_ATT_PORT']=\"TW_ATT_PORT=\"+NTM_df['TW_ATT_PORT'].astype(str)\n", "NTM_df['TW_DMG_IP']=\"TW_DMG_IP=\"+NTM_df['TW_DMG_IP'].astype(str)\n", "NTM_df['TW_DMG_PORT']=\"TW_DMG_PORT=\"+NTM_df['TW_DMG_PORT'].astype(str)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "INST_NM 0\n", "DRULE_ATT_TYPE_CODE1 0\n", "TW_ATT_IP 0\n", "TW_ATT_PORT 0\n", "TW_DMG_IP 0\n", "TW_DMG_PORT 0\n", "ACCD_DMG_PROTO_NM 0\n", "TW_ATT_CT_NM 0\n", "ACCD_FIND_MTD_CODE 0\n", "DRULE_NM 0\n", "ASSETS_VAL 0\n", "INTENT_VAL 0\n", "SOURCE_VAL 0\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##################### 여기서부터 진행하시면 됩니다. #####################\n", "##################### 아래 12개 아이템(12. 장비 ACCD_FIND_MTD_CODE 제외)에 대해서 모든 아이템 조합에 알고리즘 적용하기#####################\n", "\n", "# It should be 13 columns in total\n", "\n", "# 1. 기관 INST_NM\n", "# 2. 공격 DRULE_ATT_TYPE_CODE1\n", "# 3. 자산 ASSETS_VAL\n", "# 4. 위협공격ip TW_ATT_IP\n", "# 5. 위협공격port TW_ATT_PORT\n", "# 6. 위협피해ip TW_DMG_IP\n", "# 7. 위협피해port TW_DMG_PORT\n", "# 8. 위협피해프로토콜 ACCD_DMG_PROTO_NM\n", "# 9. 공격국가 TW_ATT_CT_NM\n", "# 10. 의도(7개) INTENT_VAL\n", "# 11. IP/URL 가중치 SOURCE_VAL\n", "# 12. 장비 ACCD_FIND_MTD_CODE\n", "# 13. 탐지규칙명 DRULE_NM\n", "\n", "\n", "# In[363]:\n", "NTM_df.isna().sum()\n", "\n", "\n", "# Change the Nan to zero\n", "NTM_df['ACCD_DMG_PROTO_NM']=NTM_df['ACCD_DMG_PROTO_NM'].replace(np.nan,'')\n", "NTM_df['INST_NM']=NTM_df['INST_NM'].replace(np.nan,'')\n", "NTM_df['DRULE_ATT_TYPE_CODE1']=NTM_df['DRULE_ATT_TYPE_CODE1'].replace(np.nan,'')\n", "NTM_df['TW_ATT_IP']=NTM_df['TW_ATT_IP'].replace(np.nan,0)\n", "NTM_df['TW_ATT_PORT']=NTM_df['TW_ATT_PORT'].replace(np.nan,0)\n", "NTM_df['TW_DMG_IP']=NTM_df['TW_DMG_IP'].replace(np.nan,0)\n", "NTM_df['TW_DMG_PORT']=NTM_df['TW_DMG_PORT'].replace(np.nan,0)\n", "NTM_df['TW_ATT_CT_NM']=NTM_df['TW_ATT_CT_NM'].replace(np.nan,'')\n", "NTM_df['ASSETS_VAL']=NTM_df['ASSETS_VAL'].replace(np.nan,0)\n", "NTM_df['INTENT_VAL']=NTM_df['INTENT_VAL'].replace(np.nan,0)\n", "NTM_df['SOURCE_VAL']=NTM_df['SOURCE_VAL'].replace(np.nan,0)\n", "NTM_df['DRULE_NM']=NTM_df['DRULE_NM'].replace(np.nan,'')\n", "\n", "\n", "# Check NaN out again\n", "NTM_df.isna().sum()\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# NTM_df의 col을 list로 저장. itertools.combinations로 가능한 시나리오 모두 추출\n", "\n", "# ACCD_FIND_MTD_CODE col 지우기\n", "NTM_df.drop(columns=['ACCD_FIND_MTD_CODE'], inplace=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from prefixspan import PrefixSpan\n", "import itertools\n", "# arr를 매개변수로 받아 n개의 아이템의 조합 반환\n", "def get_combination(arr, n):\n", " combination_n = list(itertools.combinations(arr.columns.tolist(),n))\n", " combination_n = [com for com in combination_n if 'DRULE_ATT_TYPE_CODE1' in com]\n", " com_list=[]\n", " # row i 의 (1,2),(1,3)... 이런식으로 하니까 시간 너무 오래걸림\n", " # (1,2) 조합에 대한 row i, row i+1, row i+2... 이렇게 바꿈\n", " for m in range(len(combination_n)):\n", " for i in range(len(arr)):\n", " temp_list=[]\n", " temp_df = arr.iloc[i]\n", " for col in combination_n[m]:\n", " # 공백 처리\n", " if(temp_df[col]==''):\n", " break\n", " else:\n", " temp_list.append(temp_df[col])\n", " com_list.append(temp_list)\n", " prefix = get_prefixspan(com_list)\n", " return prefix\n", "\n", "def get_prefixspan(load_list):\n", " n = len(load_list[0])\n", " save_list = PrefixSpan(load_list)\n", " #n개 아이템 조합으로 이루어졌는데 n보다 작은 갯수의 아이템으로 이루어진 prefixspan 결과 값 나옴 \n", " # 방지하기 위해 prefixspan의 결과값에는 'n개의 아이템의 값'이 다 들어가도록 filter 설정\n", " save_list = save_list.frequent(1,filter = lambda patt, matches:len(patt)>=n)\n", " save_df = pd.DataFrame(save_list)\n", " save_df.rename(columns={0:'Frequency',1:'Cause'},inplace=True)\n", " save_df = save_df.sort_values(by=['Frequency'],ascending=False,ignore_index=True)\n", " save_df = get_effect(save_df)\n", " return save_df\n", "\n", "def get_effect(edit_df):\n", " #Make the new column for filling the Effect\n", " edit_df['Effect']=np.nan\n", " #Change the order of columns\n", " edit_df=edit_df[['Cause','Effect','Frequency']]\n", " for i in range(len(edit_df)):\n", " drules=['Attack','DDOS','HACK','MAIL','Malwr','WEB']\n", " temp_df = edit_df.loc[i]\n", " for item in temp_df['Cause']:\n", " for drule in drules:\n", " if item == drule:\n", " edit_df.loc[i,'Effect'] = item\n", " return edit_df\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "| \n", " | Cause | \n", "Effect | \n", "Frequency | \n", "
|---|---|---|---|
| 0 | \n", "[Attack, 'RISK_V2.INTENT_VAL_5=단순침입'] | \n", "Attack | \n", "7709 | \n", "
| 1 | \n", "[Attack, 'RISK_V2.ASSETS_VAL_1=공인-전체IP대역(유선)'] | \n", "Attack | \n", "3175 | \n", "
| 2 | \n", "[Attack, Attack-Scan-29-01-PHPUnit(CVE17-9841)... | \n", "Attack | \n", "2770 | \n", "
| 3 | \n", "[Attack, 중국] | \n", "Attack | \n", "2689 | \n", "
| 4 | \n", "[Attack, 'RISK_V2.SOURCE_VAL_3=ECSC Black IP'] | \n", "Attack | \n", "1904 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 41145 | \n", "[Attack, TW_ATT_PORT=5389] | \n", "Attack | \n", "1 | \n", "
| 41146 | \n", "[Attack, TW_ATT_PORT=38677] | \n", "Attack | \n", "1 | \n", "
| 41147 | \n", "[Attack, TW_ATT_PORT=8287] | \n", "Attack | \n", "1 | \n", "
| 41148 | \n", "[Attack, TW_ATT_PORT=2404] | \n", "Attack | \n", "1 | \n", "
| 41149 | \n", "[Seoul Christian University, Malwr] | \n", "Malwr | \n", "1 | \n", "
41150 rows × 3 columns
\n", "