KNN模型家族Top10列表
测试集家族Top10列表
判定依据:
特别说明:
结论如下:
0.95错误样本抽样分析
0.90错误样本抽样分析
恶意家族标签
恶意样本数量
恶意样本占比
virlock
169631
26.83%
botnet
168238
26.61%
gandcrab
76101
12.04%
wabot
47340
7.49%
pluto
28554
4.52%
coinminer
19226
3.04%
autorun
17681
2.80%
virut
8012
1.27%
gandcrypt
7210
1.14%
upatre
6717
1.06%
总计
548710
86.79%
家族标签
样本数量
样本占比
clean
200
50.00%
gandcrypt
14
3.50%
upatre
13
3.25%
ipamor
11
2.75%
wabot
11
2.75%
pluto
9
2.25%
autoit
9
2.25%
virut
8
2.00%
allaple
8
2.00%
sytro
6
1.50%
总计
289
72.25%
import
sys
import
requests
from
python_mmdt.mmdt.common
import
mmdt_load
# Similarity threshold used by the evaluation: a returned match only counts
# as a detection when its similarity is strictly greater than this value.
dlt = 0.95
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def mmdt_scan_online_check():
    """Score the online mmdt scan service against a labelled feature file.

    The feature file named by ``sys.argv[1]`` is loaded with ``mmdt_load``.
    Each entry is split on ``:``; the first two fields joined back together
    form the mmdt hash, the third is the true tag and the fourth is the file
    digest. Every entry is posted to the scan service and the returned label
    and top similarity are compared with the true tag, using the module-level
    threshold ``dlt``.

    One CSV row is printed per scanned entry, followed by the TP/TN/FP/FN
    totals and accuracy, precision and recall. At most 500 entries are
    processed.

    Raises:
        IndexError: if no file name was passed on the command line or an
            entry has fewer than four ``:``-separated fields.
        requests.RequestException: if a request fails or times out.
    """
    file_name = sys.argv[1]
    features = mmdt_load(file_name)

    # NOTE(review): this endpoint literal arrived garbled -- it is not a valid
    # URL and its closing quote was missing. The text is kept verbatim; replace
    # it with the real scan-service URL before running.
    scan_url = '74eK9s2c8@1M7q4)9K6b7g2)9J5c8W2)9J5c8U0p5@1y4W2)9J5k6e0f1$3i4K6u0W2x3U0b7J5i4K6u0W2x3e0R3@1i4K6u0r3L8h3#2V1N6q4)9J5c8Y4y4U0j5h3&6Q4x3U0M7`.'
    max_samples = 500
    request_timeout = 30  # seconds; without it a stalled service hangs the run

    tp = tn = fp = fn = 0
    count = 0
    print('检测结果,文件md5,真实标签,相似文件,预测标签,相似度')
    for feature in features:
        count += 1
        tmp = feature.strip().split(":")
        file_mmdt = ':'.join(tmp[:2])
        tag = tmp[2]
        # NOTE(review): the fourth field is named file_sha1 here but is also
        # sent as "md5" and "file_name" -- confirm which digest the file holds.
        file_sha1 = tmp[3]
        data = {
            "md5": file_sha1,
            "sha1": file_sha1,
            "file_name": file_sha1,
            "mmdt": file_mmdt,
            "data": {},
        }
        r = requests.post(url=scan_url, json=data, timeout=request_timeout)
        r_data = r.json()
        status = r_data.get('status', 0)
        if status == 20001:
            # Status 20001 carries a message instead of a scan result
            # (presumably an error/rejection code -- confirm with the service).
            message = r_data.get('message', '')
            print('文件md5: %s, 状态码: %d, 提交信息: %s' % (file_sha1, status, message))
        else:
            result = r_data.get('data') or {}
            label = result.get('label', 'unknown')
            # An empty "similars" list used to raise IndexError; fall back to
            # the same defaults the per-field lookups already use.
            similars = result.get('similars') or []
            best = similars[0] if similars else {}
            sim_hash = best.get('hash', 'None')
            sim = best.get('sim', 0.0)
            if tag == label and sim > dlt:
                tp += 1
                check_result = '正确'
            elif tag == 'clean' and sim > dlt:
                fp += 1
                check_result = '错误'
            elif tag == 'clean' and sim <= dlt:
                tn += 1
                check_result = '正确'
            else:
                fn += 1
                check_result = '错误'
            print('%s,%s,%s,%s,%s,%.5f' % (check_result, file_sha1, tag, sim_hash, label, sim))
        if count >= max_samples:
            break

    print('测试mmdthash总数:%d' % count)
    print('检测正确总数:%d' % (tp + tn))
    print('检测错误总数:%d' % (fp + fn))
    print('检测TP总数:%d' % tp)
    print('检测TN总数:%d' % tn)
    print('检测FP总数:%d' % fp)
    print('检测FN总数:%d' % fn)
    # Guarded ratios: an empty run, or one with no positive predictions, would
    # otherwise end in ZeroDivisionError after all the scanning is done.
    print('检测准确率ACC:%.3f' % _safe_ratio(tp + tn, tp + tn + fp + fn))
    print('检测精确率PRE:%.3f' % _safe_ratio(tp, tp + fp))
    print('检测召回率REC:%.3f' % _safe_ratio(tp, tp + fn))
def main():
    """Script entry point: run the online scan evaluation."""
    mmdt_scan_online_check()
# Run the evaluation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
import
sys
import
requests
from
python_mmdt.mmdt.common
import
mmdt_load
# Similarity threshold used by the evaluation: a returned match only counts
# as a detection when its similarity is strictly greater than this value.
dlt = 0.95
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def mmdt_scan_online_check():
    """Score the online mmdt scan service against a labelled feature file.

    The feature file named by ``sys.argv[1]`` is loaded with ``mmdt_load``.
    Each entry is split on ``:``; the first two fields joined back together
    form the mmdt hash, the third is the true tag and the fourth is the file
    digest. Every entry is posted to the scan service and the returned label
    and top similarity are compared with the true tag, using the module-level
    threshold ``dlt``.

    One CSV row is printed per scanned entry, followed by the TP/TN/FP/FN
    totals and accuracy, precision and recall. At most 500 entries are
    processed.

    Raises:
        IndexError: if no file name was passed on the command line or an
            entry has fewer than four ``:``-separated fields.
        requests.RequestException: if a request fails or times out.
    """
    file_name = sys.argv[1]
    features = mmdt_load(file_name)

    # NOTE(review): this endpoint literal arrived garbled -- it is not a valid
    # URL and its closing quote was missing. The text is kept verbatim; replace
    # it with the real scan-service URL before running.
    scan_url = '74eK9s2c8@1M7q4)9K6b7g2)9J5c8W2)9J5c8U0p5@1y4W2)9J5k6e0f1$3i4K6u0W2x3U0b7J5i4K6u0W2x3e0R3@1i4K6u0r3L8h3#2V1N6q4)9J5c8Y4y4U0j5h3&6Q4x3U0M7`.'
    max_samples = 500
    request_timeout = 30  # seconds; without it a stalled service hangs the run

    tp = tn = fp = fn = 0
    count = 0
    print('检测结果,文件md5,真实标签,相似文件,预测标签,相似度')
    for feature in features:
        count += 1
        tmp = feature.strip().split(":")
        file_mmdt = ':'.join(tmp[:2])
        tag = tmp[2]
        # NOTE(review): the fourth field is named file_sha1 here but is also
        # sent as "md5" and "file_name" -- confirm which digest the file holds.
        file_sha1 = tmp[3]
        data = {
            "md5": file_sha1,
            "sha1": file_sha1,
            "file_name": file_sha1,
            "mmdt": file_mmdt,
            "data": {},
        }
        r = requests.post(url=scan_url, json=data, timeout=request_timeout)
        r_data = r.json()
        status = r_data.get('status', 0)
        if status == 20001:
            # Status 20001 carries a message instead of a scan result
            # (presumably an error/rejection code -- confirm with the service).
            message = r_data.get('message', '')
            print('文件md5: %s, 状态码: %d, 提交信息: %s' % (file_sha1, status, message))
        else:
            result = r_data.get('data') or {}
            label = result.get('label', 'unknown')
            # An empty "similars" list used to raise IndexError; fall back to
            # the same defaults the per-field lookups already use.
            similars = result.get('similars') or []
            best = similars[0] if similars else {}
            sim_hash = best.get('hash', 'None')
            sim = best.get('sim', 0.0)
            if tag == label and sim > dlt:
                tp += 1
                check_result = '正确'
            elif tag == 'clean' and sim > dlt:
                fp += 1
                check_result = '错误'
            elif tag == 'clean' and sim <= dlt:
                tn += 1
                check_result = '正确'
            else:
                fn += 1
                check_result = '错误'
            print('%s,%s,%s,%s,%s,%.5f' % (check_result, file_sha1, tag, sim_hash, label, sim))
        if count >= max_samples:
            break

    print('测试mmdthash总数:%d' % count)
    print('检测正确总数:%d' % (tp + tn))
    print('检测错误总数:%d' % (fp + fn))
    print('检测TP总数:%d' % tp)
    print('检测TN总数:%d' % tn)
    print('检测FP总数:%d' % fp)
    print('检测FN总数:%d' % fn)
    # Guarded ratios: an empty run, or one with no positive predictions, would
    # otherwise end in ZeroDivisionError after all the scanning is done.
    print('检测准确率ACC:%.3f' % _safe_ratio(tp + tn, tp + tn + fp + fn))
    print('检测精确率PRE:%.3f' % _safe_ratio(tp, tp + fp))
    print('检测召回率REC:%.3f' % _safe_ratio(tp, tp + fn))
[培训]内核驱动高级班,冲击BAT一流互联网大厂工作,每周日13:00-18:00直播授课
最后于 2022-1-24 14:46
被大大薇薇编辑
,原因: 增加测试数据
上传的附件: