파이썬 기반의 Prefix span 분석
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PrefixSpan_20210925.py 4.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # In[1]:
  4. import os
  5. import array
  6. import math
  7. import pickle
  8. # import joblib
  9. import sys
  10. import argparse
  11. import pandas as pd
  12. import numpy as np
  13. import matplotlib.pyplot as plt
  14. from datetime import datetime
  15. from pprint import pprint
  16. import ssl
  17. from elasticsearch.connection import create_ssl_context
  18. from elasticsearch import Elasticsearch
  19. from elasticsearch import helpers
  20. import urllib3
  21. # In[3]:
  22. import pandas as pd
  23. import numpy as np
  24. from mlxtend.preprocessing import TransactionEncoder
  25. from mlxtend.frequent_patterns import association_rules, fpgrowth
  26. from prefixspan import PrefixSpan
  27. # In[4]:
  28. ssl_context = create_ssl_context()
  29. ssl_context.check_hostname = False
  30. ssl_context.verify_mode = ssl.CERT_NONE
  31. # In[12]:
  32. es = Elasticsearch(hosts=[{'host': '223.194.92.152', 'port': 9200}], scheme="http",verify_certs=False, timeout=300, ssl_context=ssl_context, http_auth=("elasticsearch", "hadoop2019@!@#$"))
  33. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  34. # In[135]:
  35. ######## 2020, 1 year ########
  36. body = {
  37. "size" : 100,
  38. "query": {
  39. "range":{
  40. "TW_COLLECT_DT":{
  41. "gte":"2020-01-01T00:00:00.625+09:00",
  42. "lte":"2020-12-31T00:00:00.625+09:00" ################
  43. }
  44. }
  45. }
  46. }
  47. res = es.search(index = 'ts_data_accident-2020', body=body)
  48. data = res['hits']['hits']
  49. total = res['hits']['total']
  50. print(total)
  51. accident = []
  52. for da in data:
  53. att_type = da['_source']
  54. # att_type["POL_NM"]=att_type["SCEN_INFOS"][0]["POL_NM"]
  55. accident.append(att_type)
  56. # df = pd.DataFrame(accident,dtype=str)
  57. df = pd.DataFrame(accident)
  58. print(df.head())
  59. # In[136]:
  60. df=df[['RISK_V2','INST_NM','DRULE_ATT_TYPE_CODE1','TW_ATT_IP','TW_ATT_PORT','TW_DMG_IP','TW_DMG_PORT','ACCD_DMG_PROTO_NM','TW_ATT_CT_NM','ACCD_FIND_MTD_CODE']]
  61. df.head()
  62. # In[248]:
  63. # import ast
  64. # Pick out it in order to get the asset, risk, intent, black IP out
  65. RISK_V2=df['RISK_V2']
  66. # risk_values=RISK_V2.values
  67. # print(risk_values)
  68. # print(type(risk_value[0]))
  69. # risk_v2_zero=RISK_V2[0]
  70. # print(RISK_V2.values[:2])
  71. # dict_risk_v2=ast.literal_eval(RISK_V2[0])
  72. # print(dict[0])
  73. # In[229]:
  74. def filter_assets_value(risk):
  75. risks=[]
  76. try:
  77. for risk_key in risk:
  78. if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
  79. risks.append(risk_key)
  80. except:
  81. print(risk)
  82. print(type(risk))
  83. finally:
  84. return risks
  85. # In[106]:
  86. # # modified
  87. # def filter_assets_value(risk):
  88. # risks=[]
  89. # for risk_key in risk:
  90. # if 'ASSETS_VAL_' in risk_key and risk[risk_key]:
  91. # risk_key_desc = 'RISK_V2.' + risk_key + '=' + get_asset_desc(risk_key)
  92. # risks.append(risk_key_desc)
  93. # return risks
  94. # In[115]:
  95. # modified
  96. def get_asset_desc(asset_field):
  97. if asset_field == 'ASSETS_VAL_1':
  98. return '공인-전체IP대역(유선)'
  99. elif asset_field == 'ASSETS_VAL_2':
  100. return '공인-전체IP대역(무선)'
  101. elif asset_field == 'ASSETS_VAL_3':
  102. return '공인-WEB서버'
  103. elif asset_field == 'ASSETS_VAL_4':
  104. return '공인-내부응용서버'
  105. elif asset_field == 'ASSETS_VAL_5':
  106. return '공인-DB서버'
  107. elif asset_field == 'ASSETS_VAL_6':
  108. return '공인-패치서버'
  109. elif asset_field == 'ASSETS_VAL_7':
  110. return '공인-네트워크'
  111. elif asset_field == 'ASSETS_VAL_8':
  112. return '공인-보안'
  113. elif asset_field == 'ASSETS_VAL_9':
  114. return '공인-업무용PC'
  115. elif asset_field == 'ASSETS_VAL_10':
  116. return '공인-비업무용PC'
  117. elif asset_field == 'ASSETS_VAL_11':
  118. return '공인-기타'
  119. elif asset_field == 'ASSETS_VAL_12':
  120. return '사설-전체IP대역(유선)'
  121. elif asset_field == 'ASSETS_VAL_13':
  122. return '사설-전체IP대역(무선)'
  123. elif asset_field == 'ASSETS_VAL_14':
  124. return '사설-WEB서버'
  125. elif asset_field == 'ASSETS_VAL_15':
  126. return '사설-내부응용서버'
  127. elif asset_field == 'ASSETS_VAL_16':
  128. return '사설-DB서버'
  129. elif asset_field == 'ASSETS_VAL_17':
  130. return '사설-패치서버'
  131. elif asset_field == 'ASSETS_VAL_18':
  132. return '사설-네트워크'
  133. elif asset_field == 'ASSETS_VAL_19':
  134. return '사설-보안'
  135. elif asset_field == 'ASSETS_VAL_20':
  136. return '사설-업무용PC'
  137. elif asset_field == 'ASSETS_VAL_21':
  138. return '사설-비업무용PC'
  139. elif asset_field == 'ASSETS_VAL_22':
  140. return '사설-기타'
  141. else:
  142. return ''
  143. # In[250]:
  144. # New assets column
  145. x=list(map(filter_assets_value, RISK_V2))
  146. # print(list(filter(lambda n:n!='None',df['ASSETS_VAL'])))
  147. len(x)
  148. # In[ ]: