report almost done
This commit is contained in:
parent
59904646fa
commit
302f53f5ea
21 changed files with 1159 additions and 502 deletions
clustering
org.apache.xerces.dom.CoreDocumentImpl_silhouette.csvorg.apache.xerces.dom.CoreDocumentImpl_stats.csvorg.apache.xerces.dom.CoreDocumentImpl_stats.pngorg.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csvorg.apache.xerces.impl.dtd.DTDGrammar_kmeans.csvorg.apache.xerces.impl.dtd.DTDGrammar_silhouette.csvorg.apache.xerces.impl.dtd.DTDGrammar_stats.csvorg.apache.xerces.impl.dtd.DTDGrammar_stats.pngorg.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csvorg.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csvorg.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csvorg.apache.xerces.impl.xs.traversers.XSDHandler_stats.csvorg.apache.xerces.impl.xs.traversers.XSDHandler_stats.pngorg.apache.xerces.xinclude.XIncludeHandler_silhouette.csvorg.apache.xerces.xinclude.XIncludeHandler_stats.csvorg.apache.xerces.xinclude.XIncludeHandler_stats.png
prec_recall.pyreport
silhouette.py
|
@ -1,45 +1,45 @@
|
|||
,k_means,hierarchical
|
||||
2,0.7008424223503156,0.3958383820498147
|
||||
3,0.5710705368479275,0.4083355324423938
|
||||
4,0.5612355754261723,0.4894345431495262
|
||||
5,0.45447105698494905,0.49390485171067744
|
||||
6,0.4542950961743021,0.49437178337314974
|
||||
7,0.5169337345938171,0.4996628258355101
|
||||
8,0.5048012323625627,0.504680719000111
|
||||
9,0.4981437021345769,0.5104029882614454
|
||||
10,0.514873610056946,0.39391549620101274
|
||||
11,0.4397616290614397,0.35593829934237226
|
||||
12,0.3966368345309925,0.3965649809723018
|
||||
13,0.40515142998089104,0.4035942512051252
|
||||
14,0.40783453521401053,0.41018624058063885
|
||||
15,0.4239033913796109,0.45557751119565765
|
||||
16,0.42065530265413026,0.47640709656766556
|
||||
17,0.44344469866152514,0.4974425160835303
|
||||
18,0.4400719065542468,0.5290487299051633
|
||||
19,0.44608395823875535,0.5485454650471248
|
||||
20,0.44877269935654723,0.5586056973417746
|
||||
21,0.48118392208651517,0.5385866967307906
|
||||
22,0.48389798035280496,0.538222592035968
|
||||
23,0.48663428414368126,0.550727295003801
|
||||
24,0.5087496231379599,0.5729072600132372
|
||||
25,0.5308958702007723,0.5954078415061489
|
||||
26,0.533742178035476,0.6182602907647171
|
||||
27,0.5366335268898433,0.6415000474402278
|
||||
28,0.5688721496510291,0.6464201697751911
|
||||
29,0.5718756117789308,0.6701951689242575
|
||||
30,0.5749678644659783,0.6738663960033637
|
||||
31,0.5975986067541601,0.6776704976739869
|
||||
32,0.6010454124801283,0.6759936834928909
|
||||
33,0.6047324451505658,0.6780082327270405
|
||||
34,0.6087467116081876,0.6819745883778254
|
||||
35,0.613131689815019,0.6860900076219251
|
||||
36,0.6386708325196511,0.6921336553243742
|
||||
37,0.6449490032291169,0.6964066920515507
|
||||
38,0.6764810977640761,0.6932020971027025
|
||||
39,0.6773895830074159,0.6977143227629022
|
||||
40,0.6918179479278735,0.7024070374495096
|
||||
41,0.6964034645667346,0.7072912915110808
|
||||
42,0.7024471122691838,0.712379056158551
|
||||
43,0.7256701207957181,0.7176833214293175
|
||||
44,0.7230610997944976,0.7232182069292477
|
||||
45,0.7289990873402858,0.7289990873402857
|
||||
2,0.3958383820498147,0.7008424223503156
|
||||
3,0.4083355324423938,0.5710705368479275
|
||||
4,0.4894345431495262,0.5612355754261723
|
||||
5,0.49390485171067744,0.45447105698494905
|
||||
6,0.49437178337314974,0.4542950961743021
|
||||
7,0.4996628258355101,0.5169337345938171
|
||||
8,0.504680719000111,0.5048012323625627
|
||||
9,0.5104029882614454,0.4981437021345769
|
||||
10,0.39391549620101274,0.514873610056946
|
||||
11,0.35593829934237226,0.4397616290614397
|
||||
12,0.3965649809723018,0.3966368345309925
|
||||
13,0.4035942512051252,0.40515142998089104
|
||||
14,0.41018624058063885,0.40783453521401053
|
||||
15,0.45557751119565765,0.4239033913796109
|
||||
16,0.47640709656766556,0.42065530265413026
|
||||
17,0.4974425160835303,0.44344469866152514
|
||||
18,0.5290487299051633,0.4400719065542468
|
||||
19,0.5485454650471248,0.44608395823875535
|
||||
20,0.5586056973417746,0.44877269935654723
|
||||
21,0.5385866967307906,0.48118392208651517
|
||||
22,0.538222592035968,0.48389798035280496
|
||||
23,0.550727295003801,0.48663428414368126
|
||||
24,0.5729072600132372,0.5087496231379599
|
||||
25,0.5954078415061489,0.5308958702007723
|
||||
26,0.6182602907647171,0.533742178035476
|
||||
27,0.6415000474402278,0.5366335268898433
|
||||
28,0.6464201697751911,0.5688721496510291
|
||||
29,0.6701951689242575,0.5718756117789308
|
||||
30,0.6738663960033637,0.5749678644659783
|
||||
31,0.6776704976739869,0.5975986067541601
|
||||
32,0.6759936834928909,0.6010454124801283
|
||||
33,0.6780082327270405,0.6047324451505658
|
||||
34,0.6819745883778254,0.6087467116081876
|
||||
35,0.6860900076219251,0.613131689815019
|
||||
36,0.6921336553243742,0.6386708325196511
|
||||
37,0.6964066920515507,0.6449490032291169
|
||||
38,0.6932020971027025,0.6764810977640761
|
||||
39,0.6977143227629022,0.6773895830074159
|
||||
40,0.7024070374495096,0.6918179479278735
|
||||
41,0.7072912915110808,0.6964034645667346
|
||||
42,0.712379056158551,0.7024471122691838
|
||||
43,0.7176833214293175,0.7256701207957181
|
||||
44,0.7232182069292477,0.7230610997944976
|
||||
45,0.7289990873402857,0.7289990873402858
|
||||
|
|
|
89
clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.csv
Normal file
89
clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.csv
Normal file
|
@ -0,0 +1,89 @@
|
|||
,algorithm,k,min,mean,max
|
||||
0,hierarchical,2,1,58.5,116
|
||||
1,k_means,2,1,58.5,116
|
||||
2,hierarchical,3,1,39.0,113
|
||||
3,k_means,3,1,39.0,115
|
||||
4,hierarchical,4,1,29.25,113
|
||||
5,k_means,4,1,29.25,98
|
||||
6,hierarchical,5,1,23.4,111
|
||||
7,k_means,5,1,23.4,98
|
||||
8,hierarchical,6,1,19.5,111
|
||||
9,k_means,6,1,19.5,98
|
||||
10,hierarchical,7,1,16.714285714285715,97
|
||||
11,k_means,7,1,16.714285714285715,98
|
||||
12,hierarchical,8,1,14.625,97
|
||||
13,k_means,8,1,14.625,98
|
||||
14,hierarchical,9,1,13.0,96
|
||||
15,k_means,9,1,13.0,97
|
||||
16,hierarchical,10,1,11.7,96
|
||||
17,k_means,10,1,11.7,92
|
||||
18,hierarchical,11,1,10.636363636363637,93
|
||||
19,k_means,11,1,10.636363636363637,89
|
||||
20,hierarchical,12,1,9.75,86
|
||||
21,k_means,12,1,9.75,84
|
||||
22,hierarchical,13,1,9.0,84
|
||||
23,k_means,13,1,9.0,83
|
||||
24,hierarchical,14,1,8.357142857142858,84
|
||||
25,k_means,14,1,8.357142857142858,83
|
||||
26,hierarchical,15,1,7.8,84
|
||||
27,k_means,15,1,7.8,77
|
||||
28,hierarchical,16,1,7.3125,84
|
||||
29,k_means,16,1,7.3125,75
|
||||
30,hierarchical,17,1,6.882352941176471,79
|
||||
31,k_means,17,1,6.882352941176471,73
|
||||
32,hierarchical,18,1,6.5,79
|
||||
33,k_means,18,1,6.5,70
|
||||
34,hierarchical,19,1,6.157894736842105,46
|
||||
35,k_means,19,1,6.157894736842105,70
|
||||
36,hierarchical,20,1,5.85,46
|
||||
37,k_means,20,1,5.85,70
|
||||
38,hierarchical,21,1,5.571428571428571,46
|
||||
39,k_means,21,1,5.571428571428571,70
|
||||
40,hierarchical,22,1,5.318181818181818,46
|
||||
41,k_means,22,1,5.318181818181818,70
|
||||
42,hierarchical,23,1,5.086956521739131,46
|
||||
43,k_means,23,1,5.086956521739131,68
|
||||
44,hierarchical,24,1,4.875,46
|
||||
45,k_means,24,1,4.875,66
|
||||
46,hierarchical,25,1,4.68,46
|
||||
47,k_means,25,1,4.68,64
|
||||
48,hierarchical,26,1,4.5,46
|
||||
49,k_means,26,1,4.5,62
|
||||
50,hierarchical,27,1,4.333333333333333,46
|
||||
51,k_means,27,1,4.333333333333333,60
|
||||
52,hierarchical,28,1,4.178571428571429,46
|
||||
53,k_means,28,1,4.178571428571429,60
|
||||
54,hierarchical,29,1,4.0344827586206895,46
|
||||
55,k_means,29,1,4.0344827586206895,58
|
||||
56,hierarchical,30,1,3.9,46
|
||||
57,k_means,30,1,3.9,57
|
||||
58,hierarchical,31,1,3.774193548387097,46
|
||||
59,k_means,31,1,3.774193548387097,56
|
||||
60,hierarchical,32,1,3.65625,46
|
||||
61,k_means,32,1,3.65625,56
|
||||
62,hierarchical,33,1,3.5454545454545454,46
|
||||
63,k_means,33,1,3.5454545454545454,56
|
||||
64,hierarchical,34,1,3.4411764705882355,46
|
||||
65,k_means,34,1,3.4411764705882355,55
|
||||
66,hierarchical,35,1,3.342857142857143,46
|
||||
67,k_means,35,1,3.342857142857143,54
|
||||
68,hierarchical,36,1,3.25,46
|
||||
69,k_means,36,1,3.25,54
|
||||
70,hierarchical,37,1,3.1621621621621623,46
|
||||
71,k_means,37,1,3.1621621621621623,53
|
||||
72,hierarchical,38,1,3.0789473684210527,46
|
||||
73,k_means,38,1,3.0789473684210527,53
|
||||
74,hierarchical,39,1,3.0,46
|
||||
75,k_means,39,1,3.0,52
|
||||
76,hierarchical,40,1,2.925,46
|
||||
77,k_means,40,1,2.925,51
|
||||
78,hierarchical,41,1,2.8536585365853657,46
|
||||
79,k_means,41,1,2.8536585365853657,50
|
||||
80,hierarchical,42,1,2.7857142857142856,46
|
||||
81,k_means,42,1,2.7857142857142856,49
|
||||
82,hierarchical,43,1,2.7209302325581395,46
|
||||
83,k_means,43,1,2.7209302325581395,48
|
||||
84,hierarchical,44,1,2.659090909090909,46
|
||||
85,k_means,44,1,2.659090909090909,47
|
||||
86,hierarchical,45,1,2.6,46
|
||||
87,k_means,45,1,2.6,46
|
|
BIN
clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.png
Normal file
BIN
clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.png
Normal file
Binary file not shown.
After (image error) Size: 98 KiB |
|
@ -1,92 +1,92 @@
|
|||
,cluster
|
||||
getGrammarDescription,5
|
||||
getGrammarDescription,0
|
||||
getElementDeclIsExternal,0
|
||||
getAttributeDeclIsExternal,0
|
||||
getAttributeDeclIndex,42
|
||||
startDTD,2
|
||||
startParameterEntity,6
|
||||
startExternalSubset,41
|
||||
endParameterEntity,6
|
||||
endExternalSubset,41
|
||||
elementDecl,31
|
||||
attributeDecl,39
|
||||
internalEntityDecl,40
|
||||
externalEntityDecl,40
|
||||
unparsedEntityDecl,40
|
||||
notationDecl,38
|
||||
endDTD,8
|
||||
setDTDSource,23
|
||||
getDTDSource,23
|
||||
textDecl,5
|
||||
comment,5
|
||||
processingInstruction,5
|
||||
startAttlist,5
|
||||
endAttlist,5
|
||||
startConditional,5
|
||||
ignoredCharacters,5
|
||||
endConditional,5
|
||||
setDTDContentModelSource,11
|
||||
getDTDContentModelSource,11
|
||||
startContentModel,33
|
||||
startGroup,53
|
||||
pcdata,51
|
||||
element,34
|
||||
separator,44
|
||||
occurrence,32
|
||||
endGroup,35
|
||||
any,5
|
||||
empty,5
|
||||
endContentModel,5
|
||||
isNamespaceAware,5
|
||||
getSymbolTable,52
|
||||
getFirstElementDeclIndex,7
|
||||
getNextElementDeclIndex,7
|
||||
getElementDeclIndex,55
|
||||
getContentSpecType,37
|
||||
getElementDecl,54
|
||||
getElementDeclName,8
|
||||
getFirstAttributeDeclIndex,20
|
||||
getNextAttributeDeclIndex,57
|
||||
getAttributeDecl,28
|
||||
isCDATAAttribute,47
|
||||
getEntityDeclIndex,5
|
||||
getEntityDecl,1
|
||||
getNotationDeclIndex,5
|
||||
getNotationDecl,10
|
||||
getContentSpec,9
|
||||
getContentSpecIndex,19
|
||||
getContentSpecAsString,43
|
||||
printElements,50
|
||||
printAttributes,49
|
||||
addContentSpecToElement,29
|
||||
getElementContentModelValidator,25
|
||||
createElementDecl,36
|
||||
setElementDecl,17
|
||||
putElementNameMapping,5
|
||||
setFirstAttributeDeclIndex,20
|
||||
setContentSpecIndex,19
|
||||
createAttributeDecl,18
|
||||
setAttributeDecl,48
|
||||
createContentSpec,21
|
||||
setContentSpec,9
|
||||
createEntityDecl,16
|
||||
setEntityDecl,1
|
||||
createNotationDecl,56
|
||||
setNotationDecl,10
|
||||
addContentSpecNode,4
|
||||
addUniqueLeafNode,4
|
||||
initializeContentModelStack,2
|
||||
isImmutable,24
|
||||
appendContentSpec,45
|
||||
printAttribute,26
|
||||
createChildModel,3
|
||||
buildSyntaxTree,3
|
||||
contentSpecTree,46
|
||||
ensureElementDeclCapacity,27
|
||||
ensureAttributeDeclCapacity,30
|
||||
ensureEntityDeclCapacity,14
|
||||
ensureNotationDeclCapacity,13
|
||||
ensureContentSpecCapacity,12
|
||||
resize,5
|
||||
isEntityDeclared,22
|
||||
isEntityUnparsed,15
|
||||
getAttributeDeclIndex,0
|
||||
startDTD,0
|
||||
startParameterEntity,0
|
||||
startExternalSubset,0
|
||||
endParameterEntity,0
|
||||
endExternalSubset,0
|
||||
elementDecl,1
|
||||
attributeDecl,1
|
||||
internalEntityDecl,0
|
||||
externalEntityDecl,0
|
||||
unparsedEntityDecl,0
|
||||
notationDecl,0
|
||||
endDTD,0
|
||||
setDTDSource,0
|
||||
getDTDSource,0
|
||||
textDecl,0
|
||||
comment,0
|
||||
processingInstruction,0
|
||||
startAttlist,0
|
||||
endAttlist,0
|
||||
startConditional,0
|
||||
ignoredCharacters,0
|
||||
endConditional,0
|
||||
setDTDContentModelSource,0
|
||||
getDTDContentModelSource,0
|
||||
startContentModel,0
|
||||
startGroup,0
|
||||
pcdata,0
|
||||
element,0
|
||||
separator,0
|
||||
occurrence,0
|
||||
endGroup,0
|
||||
any,0
|
||||
empty,0
|
||||
endContentModel,0
|
||||
isNamespaceAware,0
|
||||
getSymbolTable,0
|
||||
getFirstElementDeclIndex,0
|
||||
getNextElementDeclIndex,0
|
||||
getElementDeclIndex,0
|
||||
getContentSpecType,0
|
||||
getElementDecl,0
|
||||
getElementDeclName,0
|
||||
getFirstAttributeDeclIndex,0
|
||||
getNextAttributeDeclIndex,0
|
||||
getAttributeDecl,0
|
||||
isCDATAAttribute,0
|
||||
getEntityDeclIndex,0
|
||||
getEntityDecl,0
|
||||
getNotationDeclIndex,0
|
||||
getNotationDecl,0
|
||||
getContentSpec,0
|
||||
getContentSpecIndex,0
|
||||
getContentSpecAsString,0
|
||||
printElements,0
|
||||
printAttributes,0
|
||||
addContentSpecToElement,0
|
||||
getElementContentModelValidator,0
|
||||
createElementDecl,0
|
||||
setElementDecl,0
|
||||
putElementNameMapping,0
|
||||
setFirstAttributeDeclIndex,0
|
||||
setContentSpecIndex,0
|
||||
createAttributeDecl,0
|
||||
setAttributeDecl,0
|
||||
createContentSpec,0
|
||||
setContentSpec,0
|
||||
createEntityDecl,0
|
||||
setEntityDecl,0
|
||||
createNotationDecl,0
|
||||
setNotationDecl,0
|
||||
addContentSpecNode,0
|
||||
addUniqueLeafNode,0
|
||||
initializeContentModelStack,0
|
||||
isImmutable,0
|
||||
appendContentSpec,0
|
||||
printAttribute,0
|
||||
createChildModel,0
|
||||
buildSyntaxTree,0
|
||||
contentSpecTree,0
|
||||
ensureElementDeclCapacity,0
|
||||
ensureAttributeDeclCapacity,0
|
||||
ensureEntityDeclCapacity,0
|
||||
ensureNotationDeclCapacity,0
|
||||
ensureContentSpecCapacity,0
|
||||
resize,0
|
||||
isEntityDeclared,0
|
||||
isEntityUnparsed,0
|
||||
|
|
|
|
@ -1,92 +1,92 @@
|
|||
,cluster
|
||||
getGrammarDescription,1
|
||||
getElementDeclIsExternal,0
|
||||
getAttributeDeclIsExternal,0
|
||||
getAttributeDeclIndex,1
|
||||
startDTD,1
|
||||
startParameterEntity,1
|
||||
startExternalSubset,1
|
||||
endParameterEntity,1
|
||||
endExternalSubset,1
|
||||
elementDecl,0
|
||||
attributeDecl,0
|
||||
internalEntityDecl,1
|
||||
externalEntityDecl,1
|
||||
unparsedEntityDecl,1
|
||||
notationDecl,1
|
||||
endDTD,0
|
||||
setDTDSource,1
|
||||
getDTDSource,1
|
||||
textDecl,1
|
||||
comment,1
|
||||
processingInstruction,1
|
||||
startAttlist,1
|
||||
endAttlist,1
|
||||
startConditional,1
|
||||
ignoredCharacters,1
|
||||
endConditional,1
|
||||
setDTDContentModelSource,1
|
||||
getDTDContentModelSource,1
|
||||
startContentModel,1
|
||||
startGroup,1
|
||||
pcdata,1
|
||||
element,1
|
||||
separator,1
|
||||
occurrence,1
|
||||
endGroup,1
|
||||
any,1
|
||||
empty,1
|
||||
endContentModel,1
|
||||
isNamespaceAware,1
|
||||
getSymbolTable,1
|
||||
getFirstElementDeclIndex,1
|
||||
getNextElementDeclIndex,1
|
||||
getElementDeclIndex,1
|
||||
getContentSpecType,0
|
||||
getGrammarDescription,6
|
||||
getElementDeclIsExternal,50
|
||||
getAttributeDeclIsExternal,43
|
||||
getAttributeDeclIndex,36
|
||||
startDTD,13
|
||||
startParameterEntity,23
|
||||
startExternalSubset,54
|
||||
endParameterEntity,23
|
||||
endExternalSubset,54
|
||||
elementDecl,5
|
||||
attributeDecl,4
|
||||
internalEntityDecl,17
|
||||
externalEntityDecl,17
|
||||
unparsedEntityDecl,17
|
||||
notationDecl,40
|
||||
endDTD,29
|
||||
setDTDSource,53
|
||||
getDTDSource,53
|
||||
textDecl,6
|
||||
comment,6
|
||||
processingInstruction,6
|
||||
startAttlist,6
|
||||
endAttlist,6
|
||||
startConditional,6
|
||||
ignoredCharacters,6
|
||||
endConditional,6
|
||||
setDTDContentModelSource,51
|
||||
getDTDContentModelSource,51
|
||||
startContentModel,28
|
||||
startGroup,39
|
||||
pcdata,52
|
||||
element,2
|
||||
separator,49
|
||||
occurrence,24
|
||||
endGroup,27
|
||||
any,6
|
||||
empty,6
|
||||
endContentModel,6
|
||||
isNamespaceAware,6
|
||||
getSymbolTable,56
|
||||
getFirstElementDeclIndex,47
|
||||
getNextElementDeclIndex,47
|
||||
getElementDeclIndex,57
|
||||
getContentSpecType,38
|
||||
getElementDecl,0
|
||||
getElementDeclName,0
|
||||
getFirstAttributeDeclIndex,0
|
||||
getNextAttributeDeclIndex,0
|
||||
getAttributeDecl,0
|
||||
isCDATAAttribute,1
|
||||
getEntityDeclIndex,1
|
||||
getEntityDecl,0
|
||||
getNotationDeclIndex,1
|
||||
getNotationDecl,0
|
||||
getContentSpec,0
|
||||
getContentSpecIndex,0
|
||||
getContentSpecAsString,0
|
||||
printElements,1
|
||||
printAttributes,1
|
||||
addContentSpecToElement,1
|
||||
getElementContentModelValidator,0
|
||||
createElementDecl,0
|
||||
setElementDecl,0
|
||||
putElementNameMapping,1
|
||||
setFirstAttributeDeclIndex,0
|
||||
setContentSpecIndex,0
|
||||
createAttributeDecl,0
|
||||
setAttributeDecl,0
|
||||
createContentSpec,0
|
||||
setContentSpec,0
|
||||
createEntityDecl,0
|
||||
setEntityDecl,0
|
||||
createNotationDecl,1
|
||||
setNotationDecl,0
|
||||
addContentSpecNode,1
|
||||
addUniqueLeafNode,1
|
||||
initializeContentModelStack,1
|
||||
isImmutable,1
|
||||
appendContentSpec,1
|
||||
getElementDeclName,29
|
||||
getFirstAttributeDeclIndex,3
|
||||
getNextAttributeDeclIndex,46
|
||||
getAttributeDecl,25
|
||||
isCDATAAttribute,44
|
||||
getEntityDeclIndex,6
|
||||
getEntityDecl,8
|
||||
getNotationDeclIndex,6
|
||||
getNotationDecl,10
|
||||
getContentSpec,41
|
||||
getContentSpecIndex,12
|
||||
getContentSpecAsString,37
|
||||
printElements,55
|
||||
printAttributes,35
|
||||
addContentSpecToElement,20
|
||||
getElementContentModelValidator,21
|
||||
createElementDecl,33
|
||||
setElementDecl,16
|
||||
putElementNameMapping,6
|
||||
setFirstAttributeDeclIndex,3
|
||||
setContentSpecIndex,12
|
||||
createAttributeDecl,19
|
||||
setAttributeDecl,7
|
||||
createContentSpec,41
|
||||
setContentSpec,9
|
||||
createEntityDecl,31
|
||||
setEntityDecl,8
|
||||
createNotationDecl,32
|
||||
setNotationDecl,10
|
||||
addContentSpecNode,18
|
||||
addUniqueLeafNode,18
|
||||
initializeContentModelStack,13
|
||||
isImmutable,6
|
||||
appendContentSpec,42
|
||||
printAttribute,1
|
||||
createChildModel,1
|
||||
buildSyntaxTree,1
|
||||
contentSpecTree,1
|
||||
ensureElementDeclCapacity,1
|
||||
ensureAttributeDeclCapacity,1
|
||||
ensureEntityDeclCapacity,1
|
||||
ensureNotationDeclCapacity,1
|
||||
ensureContentSpecCapacity,1
|
||||
resize,1
|
||||
isEntityDeclared,1
|
||||
isEntityUnparsed,0
|
||||
createChildModel,11
|
||||
buildSyntaxTree,11
|
||||
contentSpecTree,30
|
||||
ensureElementDeclCapacity,15
|
||||
ensureAttributeDeclCapacity,14
|
||||
ensureEntityDeclCapacity,22
|
||||
ensureNotationDeclCapacity,26
|
||||
ensureContentSpecCapacity,34
|
||||
resize,6
|
||||
isEntityDeclared,48
|
||||
isEntityUnparsed,45
|
||||
|
|
|
|
@ -1,64 +1,64 @@
|
|||
,k_means,hierarchical
|
||||
2,0.43549549160206547,0.22916634455195753
|
||||
3,0.3737398924595095,0.2246280732293034
|
||||
4,0.3557451009153901,0.22489420158108267
|
||||
5,0.23295505680144496,0.23659327576115802
|
||||
6,0.262133112331066,0.1944787865029721
|
||||
7,0.2578980101543562,0.14449036253228517
|
||||
8,0.2549368125378225,0.14148366678653188
|
||||
9,0.2774793093993747,0.13842552961645824
|
||||
10,0.29633149188806335,0.17251507022640497
|
||||
11,0.28457149559807815,0.20347568890084347
|
||||
12,0.2774764884391462,0.23906895503283213
|
||||
13,0.2807117319594596,0.2433263434151139
|
||||
14,0.2756438988231549,0.2378679295617759
|
||||
15,0.2725133030686268,0.23691994972126937
|
||||
16,0.26609972785171476,0.23116431400607626
|
||||
17,0.2622978716191777,0.250626112587838
|
||||
18,0.2599277555662332,0.25367962227891766
|
||||
19,0.2627008352505403,0.27152241207311917
|
||||
20,0.27904812684322156,0.2937526253744639
|
||||
21,0.2862853638532431,0.29866907908096096
|
||||
22,0.28363618305324206,0.2982900685039696
|
||||
23,0.27298124922178313,0.29267556171442216
|
||||
24,0.2755401967064185,0.30932258932020334
|
||||
25,0.2699256899168711,0.30931433471981734
|
||||
26,0.27327610109462835,0.309284891816073
|
||||
27,0.2741779110906256,0.30820922828647973
|
||||
28,0.2772726745209296,0.306394576589556
|
||||
29,0.2763152122041744,0.31629054291989955
|
||||
30,0.27822954116587556,0.31889378927031037
|
||||
31,0.2765547788352012,0.31793632695355517
|
||||
32,0.2873045247363621,0.3198803243841521
|
||||
33,0.28417974562649284,0.3322984924566154
|
||||
34,0.2685472504040367,0.33205224383012144
|
||||
35,0.2640970877653046,0.32946433944653786
|
||||
36,0.26594127941463497,0.34064178452545657
|
||||
37,0.2671662834055061,0.34279546744648637
|
||||
38,0.26972862144514015,0.3520414342812306
|
||||
39,0.2745566131731437,0.35684038034252413
|
||||
40,0.3085760240111521,0.34927826706954956
|
||||
41,0.32756637032777863,0.3490443084779255
|
||||
42,0.3310796986888577,0.34372287471805796
|
||||
43,0.32889480000768656,0.32421890240508233
|
||||
44,0.31610864049926274,0.27315698867962007
|
||||
45,0.3140921194105564,0.27051011105427114
|
||||
46,0.3088953240503273,0.29627121773250714
|
||||
47,0.2693097731576138,0.32261382027270064
|
||||
48,0.2809797636777669,0.3299248655060567
|
||||
49,0.29384518410058824,0.3171387059976329
|
||||
50,0.29793575895571417,0.3442080317722919
|
||||
51,0.3025569827442159,0.3408776851426114
|
||||
52,0.32032808958922193,0.3408776851426114
|
||||
53,0.33852852210954587,0.33765907834246356
|
||||
54,0.339541278009214,0.36565310355269914
|
||||
55,0.35774171052953796,0.3818328805784584
|
||||
56,0.37594214304986195,0.3866470678901348
|
||||
57,0.4080257854586148,0.3919955336887361
|
||||
58,0.4046954388289342,0.3979724365432809
|
||||
59,0.4046954388289342,0.38857621891133143
|
||||
60,0.3931263574608019,0.3953492191827632
|
||||
61,0.38155727609266954,0.38748610984623766
|
||||
62,0.37132316722174985,0.39516141319506437
|
||||
63,0.36810456042160206,0.3850224051641811
|
||||
64,0.3565354790534698,0.3785851915638855
|
||||
2,0.22916634455195753,0.43549549160206547
|
||||
3,0.2246280732293034,0.3737398924595095
|
||||
4,0.22489420158108267,0.3557451009153901
|
||||
5,0.23659327576115802,0.23295505680144496
|
||||
6,0.1944787865029721,0.262133112331066
|
||||
7,0.14449036253228517,0.2578980101543562
|
||||
8,0.14148366678653188,0.2549368125378225
|
||||
9,0.13842552961645824,0.2774793093993747
|
||||
10,0.17251507022640497,0.29633149188806335
|
||||
11,0.20347568890084347,0.28457149559807815
|
||||
12,0.23906895503283213,0.2774764884391462
|
||||
13,0.2433263434151139,0.2807117319594596
|
||||
14,0.2378679295617759,0.2756438988231549
|
||||
15,0.23691994972126937,0.2725133030686268
|
||||
16,0.23116431400607626,0.26609972785171476
|
||||
17,0.250626112587838,0.2622978716191777
|
||||
18,0.25367962227891766,0.2599277555662332
|
||||
19,0.27152241207311917,0.2627008352505403
|
||||
20,0.2937526253744639,0.27904812684322156
|
||||
21,0.29866907908096096,0.2862853638532431
|
||||
22,0.2982900685039696,0.28363618305324206
|
||||
23,0.29267556171442216,0.27298124922178313
|
||||
24,0.30932258932020334,0.2755401967064185
|
||||
25,0.30931433471981734,0.2699256899168711
|
||||
26,0.309284891816073,0.27327610109462835
|
||||
27,0.30820922828647973,0.2741779110906256
|
||||
28,0.306394576589556,0.2772726745209296
|
||||
29,0.31629054291989955,0.2763152122041744
|
||||
30,0.31889378927031037,0.27822954116587556
|
||||
31,0.31793632695355517,0.2765547788352012
|
||||
32,0.3198803243841521,0.2873045247363621
|
||||
33,0.3322984924566154,0.28417974562649284
|
||||
34,0.33205224383012144,0.2685472504040367
|
||||
35,0.32946433944653786,0.2640970877653046
|
||||
36,0.34064178452545657,0.26594127941463497
|
||||
37,0.34279546744648637,0.2671662834055061
|
||||
38,0.3520414342812306,0.26972862144514015
|
||||
39,0.35684038034252413,0.2745566131731437
|
||||
40,0.34927826706954956,0.3085760240111521
|
||||
41,0.3490443084779255,0.32756637032777863
|
||||
42,0.34372287471805796,0.3310796986888577
|
||||
43,0.32421890240508233,0.32889480000768656
|
||||
44,0.27315698867962007,0.31610864049926274
|
||||
45,0.27051011105427114,0.3140921194105564
|
||||
46,0.29627121773250714,0.3088953240503273
|
||||
47,0.32261382027270064,0.2693097731576138
|
||||
48,0.3299248655060567,0.2809797636777669
|
||||
49,0.3171387059976329,0.29384518410058824
|
||||
50,0.3442080317722919,0.29793575895571417
|
||||
51,0.3408776851426114,0.3025569827442159
|
||||
52,0.3408776851426114,0.32032808958922193
|
||||
53,0.33765907834246356,0.33852852210954587
|
||||
54,0.36565310355269914,0.339541278009214
|
||||
55,0.3818328805784584,0.35774171052953796
|
||||
56,0.3866470678901348,0.37594214304986195
|
||||
57,0.3919955336887361,0.4080257854586148
|
||||
58,0.3979724365432809,0.4046954388289342
|
||||
59,0.38857621891133143,0.4046954388289342
|
||||
60,0.3953492191827632,0.3931263574608019
|
||||
61,0.38748610984623766,0.38155727609266954
|
||||
62,0.39516141319506437,0.37132316722174985
|
||||
63,0.3850224051641811,0.36810456042160206
|
||||
64,0.3785851915638855,0.3565354790534698
|
||||
|
|
|
127
clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.csv
Normal file
127
clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.csv
Normal file
|
@ -0,0 +1,127 @@
|
|||
,algorithm,k,min,mean,max
|
||||
0,hierarchical,2,2,45.5,89
|
||||
1,k_means,2,29,45.5,62
|
||||
2,hierarchical,3,2,30.333333333333332,85
|
||||
3,k_means,3,3,30.333333333333332,62
|
||||
4,hierarchical,4,2,22.75,82
|
||||
5,k_means,4,3,22.75,56
|
||||
6,hierarchical,5,2,18.2,64
|
||||
7,k_means,5,3,18.2,55
|
||||
8,hierarchical,6,2,15.166666666666666,57
|
||||
9,k_means,6,3,15.166666666666666,51
|
||||
10,hierarchical,7,1,13.0,57
|
||||
11,k_means,7,2,13.0,50
|
||||
12,hierarchical,8,1,11.375,57
|
||||
13,k_means,8,1,11.375,57
|
||||
14,hierarchical,9,1,10.11111111111111,57
|
||||
15,k_means,9,1,10.11111111111111,57
|
||||
16,hierarchical,10,1,9.1,50
|
||||
17,k_means,10,1,9.1,51
|
||||
18,hierarchical,11,1,8.272727272727273,46
|
||||
19,k_means,11,1,8.272727272727273,50
|
||||
20,hierarchical,12,1,7.583333333333333,46
|
||||
21,k_means,12,1,7.583333333333333,47
|
||||
22,hierarchical,13,1,7.0,46
|
||||
23,k_means,13,1,7.0,46
|
||||
24,hierarchical,14,1,6.5,46
|
||||
25,k_means,14,1,6.5,46
|
||||
26,hierarchical,15,1,6.066666666666666,46
|
||||
27,k_means,15,1,6.066666666666666,46
|
||||
28,hierarchical,16,1,5.6875,46
|
||||
29,k_means,16,1,5.6875,46
|
||||
30,hierarchical,17,1,5.352941176470588,39
|
||||
31,k_means,17,1,5.352941176470588,44
|
||||
32,hierarchical,18,1,5.055555555555555,39
|
||||
33,k_means,18,1,5.055555555555555,44
|
||||
34,hierarchical,19,1,4.7894736842105265,34
|
||||
35,k_means,19,1,4.7894736842105265,43
|
||||
36,hierarchical,20,1,4.55,34
|
||||
37,k_means,20,1,4.55,41
|
||||
38,hierarchical,21,1,4.333333333333333,34
|
||||
39,k_means,21,1,4.333333333333333,40
|
||||
40,hierarchical,22,1,4.136363636363637,34
|
||||
41,k_means,22,1,4.136363636363637,40
|
||||
42,hierarchical,23,1,3.9565217391304346,31
|
||||
43,k_means,23,1,3.9565217391304346,40
|
||||
44,hierarchical,24,1,3.7916666666666665,31
|
||||
45,k_means,24,1,3.7916666666666665,38
|
||||
46,hierarchical,25,1,3.64,31
|
||||
47,k_means,25,1,3.64,38
|
||||
48,hierarchical,26,1,3.5,31
|
||||
49,k_means,26,1,3.5,38
|
||||
50,hierarchical,27,1,3.3703703703703702,31
|
||||
51,k_means,27,1,3.3703703703703702,38
|
||||
52,hierarchical,28,1,3.25,30
|
||||
53,k_means,28,1,3.25,38
|
||||
54,hierarchical,29,1,3.1379310344827585,30
|
||||
55,k_means,29,1,3.1379310344827585,36
|
||||
56,hierarchical,30,1,3.033333333333333,30
|
||||
57,k_means,30,1,3.033333333333333,35
|
||||
58,hierarchical,31,1,2.935483870967742,30
|
||||
59,k_means,31,1,2.935483870967742,35
|
||||
60,hierarchical,32,1,2.84375,30
|
||||
61,k_means,32,1,2.84375,35
|
||||
62,hierarchical,33,1,2.757575757575758,30
|
||||
63,k_means,33,1,2.757575757575758,33
|
||||
64,hierarchical,34,1,2.676470588235294,30
|
||||
65,k_means,34,1,2.676470588235294,33
|
||||
66,hierarchical,35,1,2.6,30
|
||||
67,k_means,35,1,2.6,33
|
||||
68,hierarchical,36,1,2.5277777777777777,30
|
||||
69,k_means,36,1,2.5277777777777777,33
|
||||
70,hierarchical,37,1,2.4594594594594597,30
|
||||
71,k_means,37,1,2.4594594594594597,33
|
||||
72,hierarchical,38,1,2.3947368421052633,30
|
||||
73,k_means,38,1,2.3947368421052633,33
|
||||
74,hierarchical,39,1,2.3333333333333335,29
|
||||
75,k_means,39,1,2.3333333333333335,32
|
||||
76,hierarchical,40,1,2.275,29
|
||||
77,k_means,40,1,2.275,32
|
||||
78,hierarchical,41,1,2.2195121951219514,29
|
||||
79,k_means,41,1,2.2195121951219514,32
|
||||
80,hierarchical,42,1,2.1666666666666665,29
|
||||
81,k_means,42,1,2.1666666666666665,32
|
||||
82,hierarchical,43,1,2.116279069767442,29
|
||||
83,k_means,43,1,2.116279069767442,31
|
||||
84,hierarchical,44,1,2.0681818181818183,29
|
||||
85,k_means,44,1,2.0681818181818183,31
|
||||
86,hierarchical,45,1,2.022222222222222,29
|
||||
87,k_means,45,1,2.022222222222222,31
|
||||
88,hierarchical,46,1,1.9782608695652173,29
|
||||
89,k_means,46,1,1.9782608695652173,29
|
||||
90,hierarchical,47,1,1.9361702127659575,28
|
||||
91,k_means,47,1,1.9361702127659575,27
|
||||
92,hierarchical,48,1,1.8958333333333333,17
|
||||
93,k_means,48,1,1.8958333333333333,27
|
||||
94,hierarchical,49,1,1.8571428571428572,17
|
||||
95,k_means,49,1,1.8571428571428572,27
|
||||
96,hierarchical,50,1,1.82,17
|
||||
97,k_means,50,1,1.82,25
|
||||
98,hierarchical,51,1,1.7843137254901962,17
|
||||
99,k_means,51,1,1.7843137254901962,25
|
||||
100,hierarchical,52,1,1.75,17
|
||||
101,k_means,52,1,1.75,25
|
||||
102,hierarchical,53,1,1.7169811320754718,17
|
||||
103,k_means,53,1,1.7169811320754718,25
|
||||
104,hierarchical,54,1,1.6851851851851851,17
|
||||
105,k_means,54,1,1.6851851851851851,23
|
||||
106,hierarchical,55,1,1.6545454545454545,17
|
||||
107,k_means,55,1,1.6545454545454545,21
|
||||
108,hierarchical,56,1,1.625,17
|
||||
109,k_means,56,1,1.625,20
|
||||
110,hierarchical,57,1,1.5964912280701755,17
|
||||
111,k_means,57,1,1.5964912280701755,19
|
||||
112,hierarchical,58,1,1.5689655172413792,17
|
||||
113,k_means,58,1,1.5689655172413792,18
|
||||
114,hierarchical,59,1,1.5423728813559323,17
|
||||
115,k_means,59,1,1.5423728813559323,18
|
||||
116,hierarchical,60,1,1.5166666666666666,17
|
||||
117,k_means,60,1,1.5166666666666666,17
|
||||
118,hierarchical,61,1,1.4918032786885247,17
|
||||
119,k_means,61,1,1.4918032786885247,17
|
||||
120,hierarchical,62,1,1.467741935483871,17
|
||||
121,k_means,62,1,1.467741935483871,16
|
||||
122,hierarchical,63,1,1.4444444444444444,17
|
||||
123,k_means,63,1,1.4444444444444444,16
|
||||
124,hierarchical,64,1,1.421875,17
|
||||
125,k_means,64,1,1.421875,16
|
|
BIN
clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.png
Normal file
BIN
clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.png
Normal file
Binary file not shown.
After (image error) Size: 98 KiB |
|
@ -11,7 +11,7 @@ constructTrees,0
|
|||
isExistingGrammar,0
|
||||
updateImportListFor,0
|
||||
updateImportListWith,0
|
||||
buildGlobalNameRegistries,0
|
||||
buildGlobalNameRegistries,2
|
||||
traverseSchemas,0
|
||||
needReportTNSError,0
|
||||
addGlobalAttributeDecl,0
|
||||
|
|
|
|
@ -36,7 +36,7 @@ getGrpOrAttrGrpRedefinedByRestriction,0
|
|||
resolveKeyRefs,0
|
||||
getIDRegistry,0
|
||||
getIDRegistry_sub,0
|
||||
storeKeyRef,2
|
||||
storeKeyRef,0
|
||||
resolveSchema,0
|
||||
resolveSchemaSource,0
|
||||
getSchemaDocument,0
|
||||
|
|
|
|
@ -1,64 +1,64 @@
|
|||
,k_means,hierarchical
|
||||
2,0.598553678618089,0.598553678618089
|
||||
3,0.5988635577957939,0.4340479638200015
|
||||
4,0.5768968986366794,0.4446910360233003
|
||||
5,0.5600582308059449,0.44970649031040394
|
||||
6,0.4754961922118064,0.42291318953408236
|
||||
7,0.4716129791423394,0.37302776265331616
|
||||
8,0.4215235973451702,0.16585341129364783
|
||||
9,0.4270412461769427,0.1805562766904707
|
||||
10,0.42033460704259445,0.10021703881198853
|
||||
11,0.3979172260202459,0.11227880527684016
|
||||
12,0.3992377842624971,0.13834960978465374
|
||||
13,0.36351812430049024,0.14637482631499601
|
||||
14,0.36086605127470145,0.15339700393049752
|
||||
15,0.27803229144747893,0.17519153912543511
|
||||
16,0.24764306057751692,0.18163489682652323
|
||||
17,0.2546247662068935,0.1920283064393974
|
||||
18,0.27281600254442556,0.1968887014348958
|
||||
19,0.2705186834360297,0.22320550740329767
|
||||
20,0.29969231483298964,0.2278559856358303
|
||||
21,0.31507175636228785,0.21631113790331308
|
||||
22,0.3170839571491974,0.23240216910856668
|
||||
23,0.3201062001500274,0.23590483919206368
|
||||
24,0.24788100874579763,0.24413294581937137
|
||||
25,0.24932439019964475,0.2514892577758059
|
||||
26,0.25682838168308425,0.2573251636281981
|
||||
27,0.25691108409617125,0.2629575908594159
|
||||
28,0.2606141413445487,0.27452933746874875
|
||||
29,0.2538973293819504,0.27138907448677696
|
||||
30,0.26300677134410877,0.27608901099722993
|
||||
31,0.25958787047274295,0.2734068655042204
|
||||
32,0.2620577633391267,0.26668692055998694
|
||||
33,0.2677665846189286,0.27190541883537933
|
||||
34,0.2698493206362974,0.2737953942290021
|
||||
35,0.26871523120875485,0.2792490491212266
|
||||
36,0.2706224838853019,0.285255446778457
|
||||
37,0.27443698923839605,0.28791640737048424
|
||||
38,0.27814176822064324,0.279340819685821
|
||||
39,0.27606460269195954,0.27183941054653343
|
||||
40,0.27307694582354536,0.2763117563875985
|
||||
41,0.27681213230462487,0.28090687018324295
|
||||
42,0.2763401507651925,0.2823757748346625
|
||||
43,0.2905756087008992,0.284106939165533
|
||||
44,0.2901305465431984,0.28892051024774673
|
||||
45,0.2866682178814574,0.31067801214219776
|
||||
46,0.2748147271365624,0.33271255209429573
|
||||
47,0.2824570955002154,0.33809240352772785
|
||||
48,0.28532199483886955,0.342466005901906
|
||||
49,0.3194897125271686,0.3472678571815208
|
||||
50,0.32227149882332984,0.3494143489069156
|
||||
51,0.32507315719109064,0.3553262507378467
|
||||
52,0.3483927215781152,0.3577437943048381
|
||||
53,0.35116863658039477,0.36265544445738723
|
||||
54,0.35397195095412226,0.3652777392559547
|
||||
55,0.35681293946359083,0.3635299110583668
|
||||
56,0.35970774816697515,0.36657235082485046
|
||||
57,0.36268328716123316,0.369801843033111
|
||||
58,0.365787994620889,0.373236109725014
|
||||
59,0.36911874689499113,0.3768951711426859
|
||||
60,0.3899012348681259,0.38080172755114144
|
||||
61,0.3925538526988892,0.38498161884368615
|
||||
62,0.3952064705296526,0.38946438071227807
|
||||
63,0.40051170619117926,0.39428392137375445
|
||||
64,0.4003435986724249,0.4003771346837245
|
||||
3,0.4340479638200015,0.5988635577957939
|
||||
4,0.4446910360233003,0.5768968986366794
|
||||
5,0.44970649031040394,0.5600582308059449
|
||||
6,0.42291318953408236,0.4754961922118064
|
||||
7,0.37302776265331616,0.4716129791423394
|
||||
8,0.16585341129364783,0.4215235973451702
|
||||
9,0.1805562766904707,0.4270412461769427
|
||||
10,0.10021703881198853,0.42033460704259445
|
||||
11,0.11227880527684016,0.3979172260202459
|
||||
12,0.13834960978465374,0.3992377842624971
|
||||
13,0.14637482631499601,0.36351812430049024
|
||||
14,0.15339700393049752,0.36086605127470145
|
||||
15,0.17519153912543511,0.27803229144747893
|
||||
16,0.18163489682652323,0.24764306057751692
|
||||
17,0.1920283064393974,0.2546247662068935
|
||||
18,0.1968887014348958,0.27281600254442556
|
||||
19,0.22320550740329767,0.2705186834360297
|
||||
20,0.2278559856358303,0.29969231483298964
|
||||
21,0.21631113790331308,0.31507175636228785
|
||||
22,0.23240216910856668,0.3170839571491974
|
||||
23,0.23590483919206368,0.3201062001500274
|
||||
24,0.24413294581937137,0.24788100874579763
|
||||
25,0.2514892577758059,0.24932439019964475
|
||||
26,0.2573251636281981,0.25682838168308425
|
||||
27,0.2629575908594159,0.25691108409617125
|
||||
28,0.27452933746874875,0.2606141413445487
|
||||
29,0.27138907448677696,0.2538973293819504
|
||||
30,0.27608901099722993,0.26300677134410877
|
||||
31,0.2734068655042204,0.25958787047274295
|
||||
32,0.26668692055998694,0.2620577633391267
|
||||
33,0.27190541883537933,0.2677665846189286
|
||||
34,0.2737953942290021,0.2698493206362974
|
||||
35,0.2792490491212266,0.26871523120875485
|
||||
36,0.285255446778457,0.2706224838853019
|
||||
37,0.28791640737048424,0.27443698923839605
|
||||
38,0.279340819685821,0.27814176822064324
|
||||
39,0.27183941054653343,0.27606460269195954
|
||||
40,0.2763117563875985,0.27307694582354536
|
||||
41,0.28090687018324295,0.27681213230462487
|
||||
42,0.2823757748346625,0.2763401507651925
|
||||
43,0.284106939165533,0.2905756087008992
|
||||
44,0.28892051024774673,0.2901305465431984
|
||||
45,0.31067801214219776,0.2866682178814574
|
||||
46,0.33271255209429573,0.2748147271365624
|
||||
47,0.33809240352772785,0.2824570955002154
|
||||
48,0.342466005901906,0.28532199483886955
|
||||
49,0.3472678571815208,0.3194897125271686
|
||||
50,0.3494143489069156,0.32227149882332984
|
||||
51,0.3553262507378467,0.32507315719109064
|
||||
52,0.3577437943048381,0.3483927215781152
|
||||
53,0.36265544445738723,0.35116863658039477
|
||||
54,0.3652777392559547,0.35397195095412226
|
||||
55,0.3635299110583668,0.35681293946359083
|
||||
56,0.36657235082485046,0.35970774816697515
|
||||
57,0.369801843033111,0.36268328716123316
|
||||
58,0.373236109725014,0.365787994620889
|
||||
59,0.3768951711426859,0.36911874689499113
|
||||
60,0.38080172755114144,0.3899012348681259
|
||||
61,0.38498161884368615,0.3925538526988892
|
||||
62,0.38946438071227807,0.3952064705296526
|
||||
63,0.39428392137375445,0.40051170619117926
|
||||
64,0.4003771346837245,0.4003435986724249
|
||||
|
|
|
|
@ -0,0 +1,127 @@
|
|||
,algorithm,k,min,mean,max
|
||||
0,hierarchical,2,1,53.0,105
|
||||
1,k_means,2,1,53.0,105
|
||||
2,hierarchical,3,1,35.333333333333336,104
|
||||
3,k_means,3,1,35.333333333333336,104
|
||||
4,hierarchical,4,1,26.5,102
|
||||
5,k_means,4,1,26.5,102
|
||||
6,hierarchical,5,1,21.2,102
|
||||
7,k_means,5,1,21.2,101
|
||||
8,hierarchical,6,1,17.666666666666668,99
|
||||
9,k_means,6,1,17.666666666666668,99
|
||||
10,hierarchical,7,1,15.142857142857142,98
|
||||
11,k_means,7,1,15.142857142857142,98
|
||||
12,hierarchical,8,1,13.25,96
|
||||
13,k_means,8,1,13.25,91
|
||||
14,hierarchical,9,1,11.777777777777779,96
|
||||
15,k_means,9,1,11.777777777777779,90
|
||||
16,hierarchical,10,1,10.6,95
|
||||
17,k_means,10,1,10.6,86
|
||||
18,hierarchical,11,1,9.636363636363637,94
|
||||
19,k_means,11,1,9.636363636363637,84
|
||||
20,hierarchical,12,1,8.833333333333334,93
|
||||
21,k_means,12,1,8.833333333333334,82
|
||||
22,hierarchical,13,1,8.153846153846153,91
|
||||
23,k_means,13,1,8.153846153846153,81
|
||||
24,hierarchical,14,1,7.571428571428571,91
|
||||
25,k_means,14,1,7.571428571428571,80
|
||||
26,hierarchical,15,1,7.066666666666666,83
|
||||
27,k_means,15,1,7.066666666666666,76
|
||||
28,hierarchical,16,1,6.625,83
|
||||
29,k_means,16,1,6.625,75
|
||||
30,hierarchical,17,1,6.235294117647059,78
|
||||
31,k_means,17,1,6.235294117647059,74
|
||||
32,hierarchical,18,1,5.888888888888889,78
|
||||
33,k_means,18,1,5.888888888888889,73
|
||||
34,hierarchical,19,1,5.578947368421052,78
|
||||
35,k_means,19,1,5.578947368421052,71
|
||||
36,hierarchical,20,1,5.3,71
|
||||
37,k_means,20,1,5.3,70
|
||||
38,hierarchical,21,1,5.0476190476190474,68
|
||||
39,k_means,21,1,5.0476190476190474,69
|
||||
40,hierarchical,22,1,4.818181818181818,68
|
||||
41,k_means,22,1,4.818181818181818,65
|
||||
42,hierarchical,23,1,4.608695652173913,68
|
||||
43,k_means,23,1,4.608695652173913,65
|
||||
44,hierarchical,24,1,4.416666666666667,64
|
||||
45,k_means,24,1,4.416666666666667,64
|
||||
46,hierarchical,25,1,4.24,64
|
||||
47,k_means,25,1,4.24,62
|
||||
48,hierarchical,26,1,4.076923076923077,64
|
||||
49,k_means,26,1,4.076923076923077,61
|
||||
50,hierarchical,27,1,3.925925925925926,64
|
||||
51,k_means,27,1,3.925925925925926,60
|
||||
52,hierarchical,28,1,3.7857142857142856,63
|
||||
53,k_means,28,1,3.7857142857142856,55
|
||||
54,hierarchical,29,1,3.6551724137931036,63
|
||||
55,k_means,29,1,3.6551724137931036,55
|
||||
56,hierarchical,30,1,3.533333333333333,63
|
||||
57,k_means,30,1,3.533333333333333,54
|
||||
58,hierarchical,31,1,3.4193548387096775,63
|
||||
59,k_means,31,1,3.4193548387096775,54
|
||||
60,hierarchical,32,1,3.3125,52
|
||||
61,k_means,32,1,3.3125,54
|
||||
62,hierarchical,33,1,3.212121212121212,52
|
||||
63,k_means,33,1,3.212121212121212,53
|
||||
64,hierarchical,34,1,3.1176470588235294,52
|
||||
65,k_means,34,1,3.1176470588235294,52
|
||||
66,hierarchical,35,1,3.0285714285714285,52
|
||||
67,k_means,35,1,3.0285714285714285,51
|
||||
68,hierarchical,36,1,2.9444444444444446,52
|
||||
69,k_means,36,1,2.9444444444444446,50
|
||||
70,hierarchical,37,1,2.864864864864865,52
|
||||
71,k_means,37,1,2.864864864864865,50
|
||||
72,hierarchical,38,1,2.789473684210526,51
|
||||
73,k_means,38,1,2.789473684210526,50
|
||||
74,hierarchical,39,1,2.717948717948718,51
|
||||
75,k_means,39,1,2.717948717948718,50
|
||||
76,hierarchical,40,1,2.65,51
|
||||
77,k_means,40,1,2.65,49
|
||||
78,hierarchical,41,1,2.5853658536585367,51
|
||||
79,k_means,41,1,2.5853658536585367,48
|
||||
80,hierarchical,42,1,2.5238095238095237,51
|
||||
81,k_means,42,1,2.5238095238095237,47
|
||||
82,hierarchical,43,1,2.4651162790697674,47
|
||||
83,k_means,43,1,2.4651162790697674,47
|
||||
84,hierarchical,44,1,2.409090909090909,47
|
||||
85,k_means,44,1,2.409090909090909,46
|
||||
86,hierarchical,45,1,2.3555555555555556,47
|
||||
87,k_means,45,1,2.3555555555555556,44
|
||||
88,hierarchical,46,1,2.3043478260869565,46
|
||||
89,k_means,46,1,2.3043478260869565,42
|
||||
90,hierarchical,47,1,2.25531914893617,25
|
||||
91,k_means,47,1,2.25531914893617,41
|
||||
92,hierarchical,48,1,2.2083333333333335,25
|
||||
93,k_means,48,1,2.2083333333333335,41
|
||||
94,hierarchical,49,1,2.163265306122449,25
|
||||
95,k_means,49,1,2.163265306122449,41
|
||||
96,hierarchical,50,1,2.12,25
|
||||
97,k_means,50,1,2.12,40
|
||||
98,hierarchical,51,1,2.0784313725490198,25
|
||||
99,k_means,51,1,2.0784313725490198,39
|
||||
100,hierarchical,52,1,2.0384615384615383,25
|
||||
101,k_means,52,1,2.0384615384615383,38
|
||||
102,hierarchical,53,1,2.0,25
|
||||
103,k_means,53,1,2.0,38
|
||||
104,hierarchical,54,1,1.962962962962963,25
|
||||
105,k_means,54,1,1.962962962962963,37
|
||||
106,hierarchical,55,1,1.9272727272727272,25
|
||||
107,k_means,55,1,1.9272727272727272,35
|
||||
108,hierarchical,56,1,1.8928571428571428,25
|
||||
109,k_means,56,1,1.8928571428571428,34
|
||||
110,hierarchical,57,1,1.8596491228070176,25
|
||||
111,k_means,57,1,1.8596491228070176,33
|
||||
112,hierarchical,58,1,1.8275862068965518,25
|
||||
113,k_means,58,1,1.8275862068965518,32
|
||||
114,hierarchical,59,1,1.7966101694915255,25
|
||||
115,k_means,59,1,1.7966101694915255,31
|
||||
116,hierarchical,60,1,1.7666666666666666,25
|
||||
117,k_means,60,1,1.7666666666666666,30
|
||||
118,hierarchical,61,1,1.7377049180327868,25
|
||||
119,k_means,61,1,1.7377049180327868,29
|
||||
120,hierarchical,62,1,1.7096774193548387,25
|
||||
121,k_means,62,1,1.7096774193548387,28
|
||||
122,hierarchical,63,1,1.6825396825396826,25
|
||||
123,k_means,63,1,1.6825396825396826,27
|
||||
124,hierarchical,64,1,1.65625,25
|
||||
125,k_means,64,1,1.65625,27
|
|
Binary file not shown.
After (image error) Size: 100 KiB |
|
@ -1,64 +1,64 @@
|
|||
,k_means,hierarchical
|
||||
2,0.6855584100867681,0.6979818296524081
|
||||
3,0.6658312390685782,0.5363440260613704
|
||||
2,0.6979818296524081,0.6855584100867681
|
||||
3,0.5363440260613704,0.6658312390685782
|
||||
4,0.5447405755407478,0.5447405755407478
|
||||
5,0.49469855877597974,0.4950944104608897
|
||||
6,0.49629928069605667,0.3167619075077442
|
||||
7,0.4937183177275972,0.3273304877495634
|
||||
8,0.4903327662796836,0.16989336921679118
|
||||
9,0.33791118198002373,0.17626101482318196
|
||||
10,0.2667320598048964,0.19790344008120894
|
||||
11,0.2629948976926796,0.1943827895435377
|
||||
12,0.25965211932671445,0.20556562870341602
|
||||
13,0.26164323937367595,0.20144552653966163
|
||||
14,0.25806076142240403,0.22548403695669203
|
||||
15,0.26997893998401756,0.22918194758667895
|
||||
16,0.27256451459055664,0.2404290571765335
|
||||
17,0.2608837912623233,0.2345015455494567
|
||||
18,0.241790230179569,0.2390816398182416
|
||||
19,0.24484234464495422,0.24094820256010968
|
||||
20,0.2361050155539465,0.2435369787081999
|
||||
21,0.23692336175194548,0.262463283756636
|
||||
22,0.23946566771940794,0.2742864390420934
|
||||
23,0.24594283942153175,0.2979619533428987
|
||||
24,0.24734609860636583,0.29936015461670856
|
||||
25,0.2372755932074588,0.30224440986202594
|
||||
26,0.24082497341896647,0.30600924875137986
|
||||
27,0.24547723657004195,0.3147862783718484
|
||||
28,0.2503460498700128,0.31887407003386015
|
||||
29,0.26565769288673047,0.3204052924706567
|
||||
30,0.2951470761811464,0.3106572702067674
|
||||
31,0.30158824153259317,0.31330774028648145
|
||||
32,0.3180875184494547,0.33210454757827634
|
||||
33,0.32604023717225655,0.340503634089749
|
||||
34,0.3162922149083673,0.33568440892081625
|
||||
35,0.31716183472339093,0.3545992807283562
|
||||
36,0.3214298482703343,0.3575403386841057
|
||||
37,0.32681546873349715,0.36503026576341707
|
||||
38,0.32732304963529885,0.3738835074795801
|
||||
39,0.32990135488218114,0.3929681262996284
|
||||
40,0.32432743478528314,0.38848432563159185
|
||||
41,0.3198436341172465,0.39163178463382314
|
||||
42,0.32643375049241685,0.3860578645369252
|
||||
43,0.32203761977337186,0.4053389048253979
|
||||
44,0.3346466456087704,0.4217820126417848
|
||||
45,0.33223356867673165,0.4257244699851672
|
||||
46,0.33057959050289126,0.42988772845220063
|
||||
47,0.34588619420359423,0.4342738007458362
|
||||
48,0.3328354412937589,0.4416623097378058
|
||||
49,0.33565227636252953,0.4426712673092602
|
||||
50,0.35908519690010676,0.4440749104193141
|
||||
51,0.3619904717574287,0.4456199850626709
|
||||
52,0.36212543071422687,0.45074758794403463
|
||||
53,0.3651696447072414,0.472336181881003
|
||||
54,0.3982691624564969,0.47730382490643575
|
||||
55,0.4013572084387477,0.454200815600248
|
||||
56,0.4246326548929088,0.4565666319605046
|
||||
57,0.44773608146478383,0.4522750157266781
|
||||
58,0.45157671443203573,0.4549088773042353
|
||||
59,0.4563204517812888,0.46145616231522185
|
||||
60,0.46300257860702615,0.4641242358826516
|
||||
61,0.4967563269695634,0.47006659021417746
|
||||
62,0.4994732063134373,0.47352022985333136
|
||||
63,0.49518159007961093,0.4775136217473302
|
||||
64,0.4872143751031807,0.4821848224907804
|
||||
5,0.4950944104608897,0.49469855877597974
|
||||
6,0.3167619075077442,0.49629928069605667
|
||||
7,0.3273304877495634,0.4937183177275972
|
||||
8,0.16989336921679118,0.4903327662796836
|
||||
9,0.17626101482318196,0.33791118198002373
|
||||
10,0.19790344008120894,0.2667320598048964
|
||||
11,0.1943827895435377,0.2629948976926796
|
||||
12,0.20556562870341602,0.25965211932671445
|
||||
13,0.20144552653966163,0.26164323937367595
|
||||
14,0.22548403695669203,0.25806076142240403
|
||||
15,0.22918194758667895,0.26997893998401756
|
||||
16,0.2404290571765335,0.27256451459055664
|
||||
17,0.2345015455494567,0.2608837912623233
|
||||
18,0.2390816398182416,0.241790230179569
|
||||
19,0.24094820256010968,0.24484234464495422
|
||||
20,0.2435369787081999,0.2361050155539465
|
||||
21,0.262463283756636,0.23692336175194548
|
||||
22,0.2742864390420934,0.23946566771940794
|
||||
23,0.2979619533428987,0.24594283942153175
|
||||
24,0.29936015461670856,0.24734609860636583
|
||||
25,0.30224440986202594,0.2372755932074588
|
||||
26,0.30600924875137986,0.24082497341896647
|
||||
27,0.3147862783718484,0.24547723657004195
|
||||
28,0.31887407003386015,0.2503460498700128
|
||||
29,0.3204052924706567,0.26565769288673047
|
||||
30,0.3106572702067674,0.2951470761811464
|
||||
31,0.31330774028648145,0.30158824153259317
|
||||
32,0.33210454757827634,0.3180875184494547
|
||||
33,0.340503634089749,0.32604023717225655
|
||||
34,0.33568440892081625,0.3162922149083673
|
||||
35,0.3545992807283562,0.31716183472339093
|
||||
36,0.3575403386841057,0.3214298482703343
|
||||
37,0.36503026576341707,0.32681546873349715
|
||||
38,0.3738835074795801,0.32732304963529885
|
||||
39,0.3929681262996284,0.32990135488218114
|
||||
40,0.38848432563159185,0.32432743478528314
|
||||
41,0.39163178463382314,0.3198436341172465
|
||||
42,0.3860578645369252,0.32643375049241685
|
||||
43,0.4053389048253979,0.32203761977337186
|
||||
44,0.4217820126417848,0.3346466456087704
|
||||
45,0.4257244699851672,0.33223356867673165
|
||||
46,0.42988772845220063,0.33057959050289126
|
||||
47,0.4342738007458362,0.34588619420359423
|
||||
48,0.4416623097378058,0.3328354412937589
|
||||
49,0.4426712673092602,0.33565227636252953
|
||||
50,0.4440749104193141,0.35908519690010676
|
||||
51,0.4456199850626709,0.3619904717574287
|
||||
52,0.45074758794403463,0.36212543071422687
|
||||
53,0.472336181881003,0.3651696447072414
|
||||
54,0.47730382490643575,0.3982691624564969
|
||||
55,0.454200815600248,0.4013572084387477
|
||||
56,0.4565666319605046,0.4246326548929088
|
||||
57,0.4522750157266781,0.44773608146478383
|
||||
58,0.4549088773042353,0.45157671443203573
|
||||
59,0.46145616231522185,0.4563204517812888
|
||||
60,0.4641242358826516,0.46300257860702615
|
||||
61,0.47006659021417746,0.4967563269695634
|
||||
62,0.47352022985333136,0.4994732063134373
|
||||
63,0.4775136217473302,0.49518159007961093
|
||||
64,0.4821848224907804,0.4872143751031807
|
||||
|
|
|
127
clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.csv
Normal file
127
clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.csv
Normal file
|
@ -0,0 +1,127 @@
|
|||
,algorithm,k,min,mean,max
|
||||
0,hierarchical,2,2,54.0,106
|
||||
1,k_means,2,1,54.0,107
|
||||
2,hierarchical,3,1,36.0,106
|
||||
3,k_means,3,1,36.0,103
|
||||
4,hierarchical,4,1,27.0,102
|
||||
5,k_means,4,1,27.0,102
|
||||
6,hierarchical,5,1,21.6,101
|
||||
7,k_means,5,1,21.6,102
|
||||
8,hierarchical,6,1,18.0,100
|
||||
9,k_means,6,1,18.0,93
|
||||
10,hierarchical,7,1,15.428571428571429,99
|
||||
11,k_means,7,1,15.428571428571429,91
|
||||
12,hierarchical,8,1,13.5,99
|
||||
13,k_means,8,1,13.5,71
|
||||
14,hierarchical,9,1,12.0,90
|
||||
15,k_means,9,1,12.0,68
|
||||
16,hierarchical,10,1,10.8,80
|
||||
17,k_means,10,1,10.8,62
|
||||
18,hierarchical,11,1,9.818181818181818,80
|
||||
19,k_means,11,1,9.818181818181818,62
|
||||
20,hierarchical,12,1,9.0,80
|
||||
21,k_means,12,1,9.0,61
|
||||
22,hierarchical,13,1,8.307692307692308,79
|
||||
23,k_means,13,1,8.307692307692308,63
|
||||
24,hierarchical,14,1,7.714285714285714,77
|
||||
25,k_means,14,1,7.714285714285714,57
|
||||
26,hierarchical,15,1,7.2,77
|
||||
27,k_means,15,1,7.2,56
|
||||
28,hierarchical,16,1,6.75,76
|
||||
29,k_means,16,1,6.75,55
|
||||
30,hierarchical,17,1,6.352941176470588,76
|
||||
31,k_means,17,1,6.352941176470588,55
|
||||
32,hierarchical,18,1,6.0,65
|
||||
33,k_means,18,1,6.0,54
|
||||
34,hierarchical,19,1,5.684210526315789,65
|
||||
35,k_means,19,1,5.684210526315789,54
|
||||
36,hierarchical,20,1,5.4,65
|
||||
37,k_means,20,1,5.4,53
|
||||
38,hierarchical,21,1,5.142857142857143,65
|
||||
39,k_means,21,1,5.142857142857143,53
|
||||
40,hierarchical,22,1,4.909090909090909,64
|
||||
41,k_means,22,1,4.909090909090909,51
|
||||
42,hierarchical,23,1,4.695652173913044,64
|
||||
43,k_means,23,1,4.695652173913044,47
|
||||
44,hierarchical,24,1,4.5,64
|
||||
45,k_means,24,1,4.5,47
|
||||
46,hierarchical,25,1,4.32,34
|
||||
47,k_means,25,1,4.32,46
|
||||
48,hierarchical,26,1,4.153846153846154,34
|
||||
49,k_means,26,1,4.153846153846154,45
|
||||
50,hierarchical,27,1,4.0,34
|
||||
51,k_means,27,1,4.0,42
|
||||
52,hierarchical,28,1,3.857142857142857,34
|
||||
53,k_means,28,1,3.857142857142857,41
|
||||
54,hierarchical,29,1,3.7241379310344827,34
|
||||
55,k_means,29,1,3.7241379310344827,41
|
||||
56,hierarchical,30,1,3.6,34
|
||||
57,k_means,30,1,3.6,41
|
||||
58,hierarchical,31,1,3.4838709677419355,34
|
||||
59,k_means,31,1,3.4838709677419355,40
|
||||
60,hierarchical,32,1,3.375,34
|
||||
61,k_means,32,1,3.375,38
|
||||
62,hierarchical,33,1,3.272727272727273,34
|
||||
63,k_means,33,1,3.272727272727273,36
|
||||
64,hierarchical,34,1,3.176470588235294,34
|
||||
65,k_means,34,1,3.176470588235294,36
|
||||
66,hierarchical,35,1,3.085714285714286,34
|
||||
67,k_means,35,1,3.085714285714286,34
|
||||
68,hierarchical,36,1,3.0,34
|
||||
69,k_means,36,1,3.0,33
|
||||
70,hierarchical,37,1,2.918918918918919,34
|
||||
71,k_means,37,1,2.918918918918919,31
|
||||
72,hierarchical,38,1,2.8421052631578947,34
|
||||
73,k_means,38,1,2.8421052631578947,31
|
||||
74,hierarchical,39,1,2.769230769230769,33
|
||||
75,k_means,39,1,2.769230769230769,29
|
||||
76,hierarchical,40,1,2.7,33
|
||||
77,k_means,40,1,2.7,29
|
||||
78,hierarchical,41,1,2.6341463414634148,33
|
||||
79,k_means,41,1,2.6341463414634148,28
|
||||
80,hierarchical,42,1,2.5714285714285716,33
|
||||
81,k_means,42,1,2.5714285714285716,28
|
||||
82,hierarchical,43,1,2.511627906976744,33
|
||||
83,k_means,43,1,2.511627906976744,26
|
||||
84,hierarchical,44,1,2.4545454545454546,33
|
||||
85,k_means,44,1,2.4545454545454546,26
|
||||
86,hierarchical,45,1,2.4,33
|
||||
87,k_means,45,1,2.4,25
|
||||
88,hierarchical,46,1,2.347826086956522,33
|
||||
89,k_means,46,1,2.347826086956522,24
|
||||
90,hierarchical,47,1,2.297872340425532,33
|
||||
91,k_means,47,1,2.297872340425532,23
|
||||
92,hierarchical,48,1,2.25,21
|
||||
93,k_means,48,1,2.25,23
|
||||
94,hierarchical,49,1,2.204081632653061,20
|
||||
95,k_means,49,1,2.204081632653061,23
|
||||
96,hierarchical,50,1,2.16,18
|
||||
97,k_means,50,1,2.16,22
|
||||
98,hierarchical,51,1,2.1176470588235294,17
|
||||
99,k_means,51,1,2.1176470588235294,21
|
||||
100,hierarchical,52,1,2.076923076923077,16
|
||||
101,k_means,52,1,2.076923076923077,20
|
||||
102,hierarchical,53,1,2.0377358490566038,16
|
||||
103,k_means,53,1,2.0377358490566038,18
|
||||
104,hierarchical,54,1,2.0,16
|
||||
105,k_means,54,1,2.0,18
|
||||
106,hierarchical,55,1,1.9636363636363636,16
|
||||
107,k_means,55,1,1.9636363636363636,19
|
||||
108,hierarchical,56,1,1.9285714285714286,16
|
||||
109,k_means,56,1,1.9285714285714286,18
|
||||
110,hierarchical,57,1,1.894736842105263,16
|
||||
111,k_means,57,1,1.894736842105263,18
|
||||
112,hierarchical,58,1,1.8620689655172413,16
|
||||
113,k_means,58,1,1.8620689655172413,18
|
||||
114,hierarchical,59,1,1.8305084745762712,16
|
||||
115,k_means,59,1,1.8305084745762712,18
|
||||
116,hierarchical,60,1,1.8,16
|
||||
117,k_means,60,1,1.8,17
|
||||
118,hierarchical,61,1,1.7704918032786885,16
|
||||
119,k_means,61,1,1.7704918032786885,17
|
||||
120,hierarchical,62,1,1.7419354838709677,16
|
||||
121,k_means,62,1,1.7419354838709677,17
|
||||
122,hierarchical,63,1,1.7142857142857142,16
|
||||
123,k_means,63,1,1.7142857142857142,17
|
||||
124,hierarchical,64,1,1.6875,16
|
||||
125,k_means,64,1,1.6875,17
|
|
BIN
clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.png
Normal file
BIN
clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.png
Normal file
Binary file not shown.
After (image error) Size: 102 KiB |
|
@ -25,10 +25,16 @@ def intrapairs(path: str) -> set[set[str, str]]:
|
|||
|
||||
def main():
|
||||
filelist = glob.glob(IN_DIR + '/*_groundtruth.csv')
|
||||
df_table = pd.DataFrame(columns=pd.MultiIndex.from_tuples([
|
||||
('KMeans', 'Precision'),
|
||||
('KMeans', 'Recall'),
|
||||
('Agglomerative', 'Precision'),
|
||||
('Agglomerative', 'Recall')]))
|
||||
df_table.index.name = 'Class Name'
|
||||
|
||||
for f in filelist:
|
||||
clazz_name = os.path.basename(f)
|
||||
clazz_name = clazz_name[:clazz_name.rfind('_groundtruth.csv')]
|
||||
print(clazz_name)
|
||||
|
||||
ground_pairs = intrapairs(f)
|
||||
for method in ['kmeans', 'hierarchical']:
|
||||
|
@ -39,10 +45,15 @@ def main():
|
|||
precision = n_common / len(cluster_pairs)
|
||||
recall = n_common / len(ground_pairs)
|
||||
|
||||
print(method + " precision: " + str(precision))
|
||||
print(method + " recall: " + str(recall))
|
||||
algo = 'KMeans' if method == 'kmeans' else 'Agglomerative'
|
||||
|
||||
df_table.loc[clazz_name, [(algo, 'Precision'), (algo, 'Recall')]] = [
|
||||
str(round(precision * 100, 2)) + '%',
|
||||
str(round(recall * 100, 2)) + '%'
|
||||
]
|
||||
|
||||
print()
|
||||
df_table.columns = [x[0] + ' ' + x[1] for x in df_table.columns]
|
||||
print(df_table.to_markdown())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
8
report/build.sh
Executable file
8
report/build.sh
Executable file
|
@ -0,0 +1,8 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
|
||||
|
||||
cd "$SCRIPT_DIR"
|
||||
pandoc main.md -o main.pdf
|
259
report/main.md
259
report/main.md
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
author: Claudio Maggioni
|
||||
title: Information Modelling & Analysis -- Project 1
|
||||
geometry: margin=2.5cm,bottom=3cm
|
||||
geometry: margin=2cm,bottom=3cm
|
||||
---
|
||||
|
||||
<!--
|
||||
|
@ -9,59 +9,62 @@ The following shows a minimal submission report for project 1. If you
|
|||
choose to use this template, replace all template instructions (the
|
||||
yellow bits) with your own values. In addition, for any section, if
|
||||
**and only if** anything was unclear or warnings were raised by the
|
||||
code, and you had to take assumptions about the correct implementation
|
||||
(e.g., about details of a metric), describe your assumptions in one or
|
||||
two sentences.
|
||||
code, and you had to take assumptions about the correct implementation (e.g.,
|
||||
about details of a metric), describe your assumptions in one or two sentences.
|
||||
|
||||
You may - at your own risk - also choose not to use this template. As
|
||||
long as your submission is a latex-generated, English PDF containing all
|
||||
expected info, you'll be fine.
|
||||
-->
|
||||
You may - at your own risk - also choose not to use this template. As long as
|
||||
your submission is a latex-generated, English PDF containing all expected info,
|
||||
you'll be fine. -->
|
||||
|
||||
# Code Repository
|
||||
|
||||
The code and result files part of this submission can be found at:
|
||||
|
||||
::: center
|
||||
Repository: \url{https://github.com/infoMA2023/project-01-god-classes-maggicl}
|
||||
::: center
Repository:
|
||||
\url{https://github.com/infoMA2023/project-01-god-classes-maggicl}
|
||||
|
||||
Commit ID: **TBD**
|
||||
:::
|
||||
Commit ID: **TBD**
:::
|
||||
|
||||
# Data Pre-Processing
|
||||
|
||||
## God Classes
|
||||
|
||||
The first part of the project requires to label some classes of the _Xerces_
|
||||
project as "God classes" based on the number of methods each class has.
|
||||
The first part of the project requires labelling some classes of the _Xerces_
|
||||
project as "God classes" based on the number of methods each class has. From
|
||||
here onwards the Java package prefix `org.apache.xerces` is omitted when discussing
|
||||
fully qualified class names for the sake of brevity.
|
||||
|
||||
Specifically, I label "God classes" the classes that have a number of methods
|
||||
six times the standard deviation above the the mean number of methods, i.e. where
|
||||
the condition
|
||||
six times the standard deviation above the mean number of methods, i.e.
|
||||
where the condition
|
||||
|
||||
$$|M(C)| > \mu(M) + 6\sigma(M)$$
|
||||
|
||||
holds.
|
||||
|
||||
To scan and compute the number of methods of each class I use the Python library `javalang`, which implements the Java AST and parser. The Python script
|
||||
`./find_god_classes.py` uses this library to parse each file in the project and
|
||||
compute the number of methods of each class. Note that only non-constructor methods are counted (specifically the code counts the number of `method` nodes in each `ClassDeclaration` node).
|
||||
To scan and compute the number of methods of each class I use the Python library
|
||||
`javalang`, which implements the Java AST and parser. The Python script
|
||||
`./find_god_classes.py` uses this library to parse each file in the project and
|
||||
compute the number of methods of each class. Note that only non-constructor
|
||||
methods are counted (specifically the code counts the number of `method` nodes
|
||||
in each `ClassDeclaration` node).
|
||||
|
||||
Then, the script computes mean and standard
|
||||
deviation of the number of methods and filters the list of classes according to the
|
||||
condition described above. The file `god_classes/god_classes.csv` then is outputted
|
||||
listing all the god classes found.
|
||||
Then, the script computes mean and standard deviation of the number of methods
|
||||
and filters the list of classes according to the condition described above. The
|
||||
file `god_classes/god_classes.csv` is then written, listing all the god classes
|
||||
found.
|
||||
|
||||
The god classes I identified, and their corresponding number of methods
|
||||
can be found in Table [1](#tab:god_classes){reference-type="ref"
|
||||
The god classes I identified, and their corresponding number of methods can be
|
||||
found in Table [1](#tab:god_classes){reference-type="ref"
|
||||
reference="tab:god_classes"}.
|
||||
|
||||
::: {#tab:god_classes}
|
||||
| **Class Name** | **# Methods** |
|
||||
|:------------------------------------------------|------------:|
|
||||
| org.apache.xerces.impl.xs.traversers.XSDHandler | 118 |
|
||||
| org.apache.xerces.impl.dtd.DTDGrammar | 101 |
|
||||
| org.apache.xerces.xinclude.XIncludeHandler | 116 |
|
||||
| org.apache.xerces.dom.CoreDocumentImpl | 125 |
|
||||
| impl.xs.traversers.XSDHandler | 118 |
|
||||
| impl.dtd.DTDGrammar | 101 |
|
||||
| xinclude.XIncludeHandler | 116 |
|
||||
| dom.CoreDocumentImpl | 125 |
|
||||
|
||||
: Identified God Classes
|
||||
:::
|
||||
|
@ -70,84 +73,198 @@ reference="tab:god_classes"}.
|
|||
## Feature Vectors
|
||||
|
||||
In this part of the project we produce the feature vectors used to later cluster
|
||||
the methods of each God class into separate clusters. We produce one feature vector per
|
||||
non-constructor Java method in each god class.
|
||||
the methods of each God class into separate clusters. We produce one feature
|
||||
vector per non-constructor Java method in each god class.
|
||||
|
||||
The columns of each vector represent
|
||||
fields and methods referenced by each method, i.e. fields and methods actively used by the method in their method's body.
|
||||
The columns of each vector represent fields and methods referenced by each
|
||||
method, i.e. fields and methods actively used by the method in its own
|
||||
body.
|
||||
|
||||
When analyzing references to fields, additional constraints need to be specified to handle edge cases.
|
||||
Namely, a field's property may be referenced (e.g. an access to array `a` may fetch its `length` property, i.e. `a.length`). In this
|
||||
cases I consider the qualifier (i.e. the field itself, `a`) itself and not its property. When the qualifier is a class (i.e.
|
||||
the code references a property of another class, e.g. `Integer.MAX_VALUE`) we consider the class name itself (i.e. `Integer`) and not
|
||||
the name of the property. Should the qualifier be a subproperty itself (e.g. in `a.b.c`, where `a.b` would be the qualifier according to `javalang`)
|
||||
When analyzing references to fields, additional constraints need to be specified
|
||||
to handle edge cases. Namely, a field's property may be referenced (e.g. an
|
||||
access to array `a` may fetch its `length` property, i.e. `a.length`). In these
|
||||
cases I consider the qualifier (i.e. the field itself, `a`) and not its
|
||||
property. When the qualifier is a class (i.e. the code references a property of
|
||||
another class, e.g. `Integer.MAX_VALUE`) we consider the class name itself (i.e.
|
||||
`Integer`) and not the name of the property. Should the qualifier be a
|
||||
subproperty itself (e.g. in `a.b.c`, where `a.b` would be the qualifier
|
||||
according to `javalang`)
|
||||
|
||||
For methods, I only consider calls to methods of the class itself where the qualifier is unspecified or `this`. Calls to parent methods
|
||||
(i.e. calls like `super.something()`) are not considered.
|
||||
For methods, I only consider calls to methods of the class itself where the
|
||||
qualifier is unspecified or `this`. Calls to parent methods (i.e. calls like
|
||||
`super.something()`) are not considered.
|
||||
|
||||
The feature vector extraction phase is performed by the Python script `extract_feature_vectors.py`. The script takes `god_classes/god_classes.csv` as input
|
||||
and loads the AST of each class listed in it. Then, a list of all the fields and methods in the class is built, and each method is scanned to see which fields
|
||||
and methods it references in its body according to the previously described rules. Then, a CSV per class is built storing all feature vectors. Each file has a name matching to the FQDN (Fully-qualified domain name) of the class. Each CSV row refers to a method in the class, and each CSV column refers to a field, method or referenced class. A cell has the value of 1 when the method of that row references the field, method or class marked by that column, and it has the value 0 otherwise. Columns with only zeros are omitted.
|
||||
The feature vector extraction phase is performed by the Python script
|
||||
`extract_feature_vectors.py`. The script takes `god_classes/god_classes.csv` as
|
||||
input and loads the AST of each class listed in it. Then, a list of all the
|
||||
fields and methods in the class is built, and each method is scanned to see
|
||||
which fields and methods it references in its body according to the previously
|
||||
described rules. Then, a CSV per class is built storing all feature vectors.
|
||||
Each file has a name matching the FQDN (Fully-qualified domain name) of the
|
||||
class. Each CSV row refers to a method in the class, and each CSV column refers
|
||||
to a field, method or referenced class. A cell has the value of 1 when the
|
||||
method of that row references the field, method or class marked by that column,
|
||||
and it has the value 0 otherwise. Columns with only zeros are omitted.
|
||||
|
||||
Table [2](#tab:feat_vec){reference-type="ref" reference="tab:feat_vec"}
|
||||
shows aggregate numbers regarding the extracted feature vectors for the
|
||||
god classes. Note that the number of attributes refers to the number of fields, methods or classes actually referenced (i.e. the number of columns after omission of 0s).
|
||||
Table [2](#tab:feat_vec){reference-type="ref" reference="tab:feat_vec"} shows
|
||||
aggregate numbers regarding the extracted feature vectors for the god classes.
|
||||
Note that the number of attributes refers to the number of fields, methods or
|
||||
classes actually referenced (i.e. the number of columns after omission of 0s).
|
||||
|
||||
::: {#tab:feat_vec}
|
||||
| **Class Name** | **# Feature Vectors** | **# Attributes\*** |
|
||||
|:------------------------------------------------|----------------------:|-----------------:|
|
||||
| org.apache.xerces.impl.xs.traversers.XSDHandler | 106 | 183 |
|
||||
| org.apache.xerces.impl.dtd.DTDGrammar | 91 | 106 |
|
||||
| org.apache.xerces.xinclude.XIncludeHandler | 108 | 143 |
|
||||
| org.apache.xerces.dom.CoreDocumentImpl | 117 | 63 |
|
||||
| impl.xs.traversers.XSDHandler | 106 | 183 |
|
||||
| impl.dtd.DTDGrammar | 91 | 106 |
|
||||
| xinclude.XIncludeHandler | 108 | 143 |
|
||||
| dom.CoreDocumentImpl | 117 | 63 |
|
||||
|
||||
: Feature vector summary (\*= used at least once)
|
||||
:::
|
||||
|
||||
# Clustering {#sec:clustering}
|
||||
|
||||
In this section I cover the techniques to cluster the methods of each god
|
||||
class. The project aims to use KMeans clustering and agglomerative hierarchical
|
||||
clustering to group these methods together in cohesive units which could be
|
||||
potentially refactored out of the god class they belong to.
|
||||
|
||||
## Algorithm Configurations
|
||||
|
||||
Report/comment the algorithm configurations (distance function, linkage
|
||||
rule, etc.). You may do so in any form you feel suited, but a short
|
||||
paragraph of text is probably sufficient.
|
||||
To perform KMeans clustering, I use the `cluster.KMeans` Scikit-Learn
|
||||
implementation of the algorithm. I use the default parameters: feature vectors
|
||||
are compared with Euclidean distance, centroids are used instead of medoids,
|
||||
and the initial centroids are computed with the greedy algorithm `k-means++`. The
|
||||
random seed is fixed to $0$ to allow for reproducibility between executions of
|
||||
the clustering script.
|
||||
|
||||
To perform Hierarchical clustering, I use the `cluster.AgglomerativeClustering`
|
||||
Scikit-Learn implementation of the algorithm. Again feature vectors are
|
||||
compared with Euclidean distance, but as a linkage metric I choose to use
|
||||
complete linkage. As agglomerative clustering is deterministic, no random seed
|
||||
is needed for this algorithm.
|
||||
|
||||
I run the two algorithms for all $k \in [2,65]$, or if fewer than 65 feature
|
||||
vectors with distinct values are assigned to the god class, the upper bound of
|
||||
$k$ is such value.
|
||||
|
||||
## Testing Various K & Silhouette Scores
|
||||
|
||||
\(1\) Report data about the clusters produced by the two algorithms at
|
||||
various k (#clusters, size of clusters, silhouette scores). You may use
|
||||
any suitable format (table, graph, \...).
|
||||
To find the optimal value of $k$ for both algorithms, to compute the distribution of
|
||||
cluster sizes and silhouette across values of $k$, and to apply the optimal
|
||||
clustering for each god class I run the command:
|
||||
|
||||
\(2\) Briefly comment your results. What is the best configuration, and
|
||||
why? Anything else you observed?
|
||||
```shell
|
||||
./silhouette.py --validate --autorun
|
||||
```
|
||||
|
||||
Feature vectors are read from the `feature_vectors` directory and all the
|
||||
results are stored in the `clustering` directory.
|
||||
|
||||
Figures [1](#fig:xsd){reference-type="ref" reference="fig:xsd"},
|
||||
[2](#fig:dtd){reference-type="ref" reference="fig:dtd"},
|
||||
[3](#fig:xinc){reference-type="ref" reference="fig:xinc"}, and
|
||||
[4](#fig:cimpl){reference-type="ref" reference="fig:cimpl"} show the
|
||||
distributions of cluster sizes for each god class obtained by running the KMeans
|
||||
and agglomerative clustering algorithm as described in the previous sections.
|
||||
|
||||
For all god classes, the mean number of elements in each cluster
|
||||
exponentially decreases as $k$ increases. Aside from the first values of $k$ for
|
||||
class `DTDGrammar` (where it was 2), the minimum cluster size was 1 for all
|
||||
analyzed clusterings. Conversely, the maximum cluster size varies a lot, almost
|
||||
always being monotonically non-increasing as $k$ increases, occasionally forming
|
||||
wide plateaus. The silhouette metric distribution instead generally follows a
|
||||
dogleg-like path, sharply decreasing for the first values of $k$ and slowly
|
||||
increasing afterwards. This leads the choice of the optimal number $k$ of
|
||||
clusters for each algorithm to be between really low and really high values.
|
||||
|
||||
The figures also show the distribution of the silhouette metric per algorithm
|
||||
and per value of $k$. The optimal values of $k$ and the respective silhouette
|
||||
values for each implementation are reported in Table
|
||||
[3](#tab:sumup){reference-type="ref" reference="tab:sumup"}.
|
||||
|
||||
From the values we can gather that agglomerative clustering performs overall
|
||||
better than KMeans for the god classes in the project. Almost all god classes are
|
||||
optimally clustered with few clusters, with the exception of `CoreDocumentImpl`
|
||||
being optimally clustered with unit clusters. This could indicate higher
|
||||
cohesion between implementation details of the other classes, and lower cohesion
|
||||
in `CoreDocumentImpl` (given the name it would not be surprising if this class
|
||||
plays the role of a utility class of sorts, combining lots of implementation
|
||||
details affecting different areas of the code).
|
||||
|
||||
Agglomerative clustering with complete linkage could perform better than KMeans
|
||||
due to a more urgent need for separation rather than cohesion in the classes
|
||||
that were analyzed. Given the high dimensionality of the feature vectors used,
|
||||
and the fact that Euclidean distance is used to compare feature vectors, the
|
||||
hyper-space of method features for each god class is likely sparse, with
|
||||
occasional clusters of tightly-knit features. Given the prevailing sparsity,
|
||||
complete linkage could be suitable here since it avoids agglomerating distant
|
||||
clusters above all.
|
||||
|
||||
![Clustering metrics for class impl.xs.traversers.XSDHandler](../clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.png){#fig:xsd}
|
||||
|
||||
![Clustering metrics for class impl.dtd.DTDGrammar](../clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.png){#fig:dtd}
|
||||
|
||||
![Clustering metrics for class xinclude.XIncludeHandler](../clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.png){#fig:xinc}
|
||||
|
||||
![Clustering metrics for class dom.CoreDocumentImpl](../clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.png){#fig:cimpl}
|
||||
|
||||
::: {#tab:sumup}
|
||||
| **Class Name** | **KMeans K** | **KMeans silhouette** | **Hierarchical K** | **Hierarchical silhouette** |
|
||||
|:----------------------------|-----------:|--------------------:|-----------------:|--------------------------:|
|
||||
| dom.CoreDocumentImpl | 45 |0.7290 | 45 | 0.7290 |
|
||||
| impl.xs.traversers.XSDHandler | 2 |0.5986 | 3 | 0.5989 |
|
||||
| impl.dtd.DTDGrammar | 58 |0.3980 | 2 | 0.4355 |
|
||||
| xinclude.XIncludeHandler | 2 |0.6980 | 2 | 0.6856 |
|
||||
|
||||
: Optimal hyperparameters and corresponding silhouette metrics for KMeans and
|
||||
Hierarchical clustering algorithm.
|
||||
:::
|
||||
|
||||
# Evaluation
|
||||
|
||||
## Ground Truth
|
||||
|
||||
I computed the ground truth using the command \.... The generated files
|
||||
are checked into the repository with the names \....
|
||||
I computed the ground truth using the Python script `./ground_truth.py`. The
|
||||
generated files are checked into the repository with the names
|
||||
`clustering/{className}_groundtruth.csv` where `{className}` is the FQDN of each
|
||||
god class.
|
||||
|
||||
Comment briefly on the strengths & weaknesses of our ground truth.
|
||||
The ground truth in this project is not given but generated according to simple
|
||||
heuristics. Since no inherent structure or labelling from experts exists to
|
||||
group the methods in each god class, the project requires labelling methods based
|
||||
on keyword matching within each method name. The list of keywords used can be
|
||||
found in `keyword_list.txt`. This approach makes it possible to have a ground truth at all
|
||||
with little computational cost and labelling effort, but it assumes the method
|
||||
name and the chosen keywords are indeed of enough significance to form a
|
||||
meaningful clustering of methods that form refactorable cohesive units of
|
||||
functionality.
|
||||
|
||||
## Precision and Recall
|
||||
|
||||
::: {#tab:eval}
|
||||
---------------- ------------------- -------- ------------- --------
|
||||
**Class Name** **Agglomerative** **K-Means**
|
||||
Prec. Recall Prec. Recall
|
||||
\... \... \... \... \...
|
||||
---------------- ------------------- -------- ------------- --------
|
||||
| **Class Name** | **KMeans Precision** | **KMeans Recall** | **Agglomerative Precision** | **Agglomerative Recall** |
|
||||
|:------------------------------------------------|-------------------:|----------------:|--------------------------:|-----------------------:|
|
||||
| xinclude.XIncludeHandler | 69.83% | 97.80% | 69.58% | 95.65% |
|
||||
| dom.CoreDocumentImpl | 64.80% | 28.26% | 68.11% | 29.70% |
|
||||
| impl.xs.traversers.XSDHandler | 36.17% | 97.24% | 36.45% | 96.11% |
|
||||
| impl.dtd.DTDGrammar | 87.65% | 6.87% | 52.21% | 94.28% |
|
||||
|
||||
: Evaluation Summary
|
||||
:::
|
||||
|
||||
Precision and Recall, for the optimal configurations found in Section
|
||||
[3](#sec:clustering){reference-type="ref" reference="sec:clustering"},
|
||||
are reported in Table [3](#tab:eval){reference-type="ref"
|
||||
reference="tab:eval"}.
|
||||
[3](#sec:clustering){reference-type="ref" reference="sec:clustering"}, are
|
||||
reported in Table [4](#tab:eval){reference-type="ref" reference="tab:eval"}.
|
||||
|
||||
\begin{center}
|
||||
\color{red} comment precision and recall values
|
||||
\end{center}
|
||||
|
||||
## Practical Usefulness
|
||||
|
||||
Discuss the practical usefulness of the obtained code refactoring
|
||||
assistant in a realistic setting (1 paragraph).
|
||||
\begin{center}
|
||||
\color{red}Discuss the practical usefulness of the obtained code refactoring assistant in a
|
||||
realistic setting (1 paragraph).
|
||||
\end{center}
|
||||
|
||||
|
|
BIN
report/main.pdf
BIN
report/main.pdf
Binary file not shown.
|
@ -8,6 +8,9 @@ import pandas as pd
|
|||
import argparse
|
||||
from k_means import cluster_kmeans
|
||||
from hierarchical import cluster_hierarchical
|
||||
from collections import Counter
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
DIR: str = os.path.dirname(os.path.realpath(__file__))
|
||||
OUT_DIR: str = DIR + '/clustering'
|
||||
|
@ -20,47 +23,91 @@ def clean_output():
|
|||
filelist = glob.glob(OUT_DIR + '/*_silhouette.csv')
|
||||
for f in filelist:
|
||||
os.remove(f)
|
||||
filelist = glob.glob(OUT_DIR + '/*.png')
|
||||
for f in filelist:
|
||||
os.remove(f)
|
||||
|
||||
|
||||
def validate(path: str, clazz_name: str, autorun: bool):
|
||||
def validate(path: str, clazz_name: str, autorun: bool, df_table):
|
||||
df = pd.DataFrame(columns=['k_means', 'hierarchical'], dtype=float)
|
||||
df_stats = pd.DataFrame(columns=['algorithm', 'k', 'min', 'mean', 'max'])
|
||||
|
||||
def add_stat(algo: str, k: int, Y: any, i: int):
|
||||
y_occurs = list(Counter(Y).values()) # count number of elements in each cluster
|
||||
df_stats.loc[i, :] = [algo, k, np.min(y_occurs), np.mean(y_occurs), np.max(y_occurs)]
|
||||
|
||||
# We bound the number of clusters by the number of distinct points in our dataset.
|
||||
# To count them, we compute the number of "distinct" feature vectors and we
|
||||
# bound to the minimum of K_MAX and this number.
|
||||
nodup = pd.read_csv(path, index_col=0).drop_duplicates()
|
||||
max_distinct = len(nodup)
|
||||
print("Max distinct:", max_distinct)
|
||||
limit = min(K_MAX, max_distinct)
|
||||
|
||||
for n in range(2, min(K_MAX, max_distinct)):
|
||||
i: int = 0
|
||||
for n in range(2, limit):
|
||||
X_h, Y_h = cluster_hierarchical(path, n, save_to_disk=False)
|
||||
df.loc[n, 'k_means'] = silhouette_score(X_h, Y_h)
|
||||
add_stat('hierarchical', n, Y_h, i)
|
||||
i += 1
|
||||
df.loc[n, 'hierarchical'] = silhouette_score(X_h, Y_h)
|
||||
|
||||
X_k, Y_k = cluster_kmeans(path, n, save_to_disk=False)
|
||||
df.loc[n, 'hierarchical'] = silhouette_score(X_k, Y_k)
|
||||
add_stat('k_means', n, Y_k, i)
|
||||
i += 1
|
||||
df.loc[n, 'k_means'] = silhouette_score(X_k, Y_k)
|
||||
|
||||
k_kmeans = df[['k_means']].idxmax()[0]
|
||||
k_hierarchical = df[['hierarchical']].idxmax()[0]
|
||||
|
||||
print("K_means optimal value: " + str(k_kmeans))
|
||||
print("Hierarchical optimal value: " + str(k_hierarchical))
|
||||
df_table.loc[clazz_name] = [k_kmeans, 0, k_hierarchical, 0]
|
||||
|
||||
df.to_csv(OUT_DIR + '/' + clazz_name + '_silhouette.csv')
|
||||
df_stats.to_csv(OUT_DIR + '/' + clazz_name + '_stats.csv')
|
||||
|
||||
if autorun:
|
||||
cluster_hierarchical(path, k_hierarchical)
|
||||
cluster_kmeans(path, k_kmeans)
|
||||
|
||||
# Plot stats
|
||||
sns.set_theme(palette="hls")
|
||||
|
||||
# Initialize the matplotlib figure
|
||||
f = plt.figure(figsize=(14, 12))
|
||||
gs = f.add_gridspec(2, 2)
|
||||
ax1 = f.add_subplot(gs[0, 0])
|
||||
ax2 = f.add_subplot(gs[0, 1])
|
||||
ax3 = f.add_subplot(gs[1, :])
|
||||
|
||||
df_k = df_stats.loc[df_stats.algorithm == 'k_means', ['k', 'min', 'mean', 'max']].set_index('k', drop=True)
|
||||
df_h = df_stats.loc[df_stats.algorithm == 'hierarchical', ['k', 'min', 'mean', 'max']].set_index('k', drop=True)
|
||||
|
||||
sns.lineplot(data=df_k, palette="tab10", ax=ax1)
|
||||
sns.lineplot(data=df_h, palette="tab10", ax=ax2)
|
||||
sns.lineplot(data=df, palette="tab10", ax=ax3)
|
||||
|
||||
# Add a legend and informative axis label
|
||||
ax1.set(ylabel="# of elements", ylim=[0, 130], xlabel="# of clusters", xlim=[2, limit])
|
||||
ax1.set_title("K-Means cluster sizes")
|
||||
ax2.set(ylabel="# of elements", ylim=[0, 130], xlabel="# of clusters", xlim=[2, limit])
|
||||
ax2.set_title("Hierarchical cluster sizes")
|
||||
ax3.set(ylabel="Silhouette", ylim=[0, 1], xlabel="# of clusters", xlim=[2, limit])
|
||||
ax3.set_title("Silhouette metrics per # of clusters")
|
||||
|
||||
sns.despine(left=True, bottom=True)
|
||||
f.savefig(OUT_DIR + '/' + clazz_name + '_stats.png')
|
||||
plt.clf()
|
||||
|
||||
|
||||
def compute_silhouette(path: str, clazz_name: str, suffix: str):
|
||||
def compute_silhouette(path: str, clazz_name: str, suffix: str) -> float:
|
||||
df_y = pd.read_csv(OUT_DIR + '/' + clazz_name + '_' + suffix + '.csv')
|
||||
Y = df_y.iloc[:, 1].values
|
||||
|
||||
df = pd.read_csv(path)
|
||||
X = df.drop(df.columns[0], axis=1).to_numpy()
|
||||
|
||||
print("Silhouette for " + suffix + ": " + str(silhouette_score(X, Y)))
|
||||
s = round(silhouette_score(X, Y), 4)
|
||||
|
||||
print("Silhouette for " + suffix + ": " + str(s))
|
||||
return s
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -70,26 +117,30 @@ def main():
|
|||
parser.add_argument('--autorun', action='store_true',
|
||||
help='if validating, computes CSV for optimal clustering automatically')
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.validate:
|
||||
clean_output()
|
||||
|
||||
df_table = pd.DataFrame(columns=['KMeans K', 'KMeans silhouette', 'Hierarchical K', 'Hierarchical silhouette'])
|
||||
|
||||
filelist = glob.glob(IN_DIR + '/*.csv')
|
||||
for f in filelist:
|
||||
clazz_name = os.path.basename(f)
|
||||
clazz_name = clazz_name[:clazz_name.rfind('.')]
|
||||
|
||||
print(clazz_name)
|
||||
if args.validate:
|
||||
validate(f, clazz_name, args.autorun, df_table)
|
||||
|
||||
sk = compute_silhouette(f, clazz_name, 'kmeans')
|
||||
sh = compute_silhouette(f, clazz_name, 'hierarchical')
|
||||
|
||||
if args.validate:
|
||||
validate(f, clazz_name, args.autorun)
|
||||
df_table.loc[clazz_name, 'KMeans silhouette'] = sk
|
||||
df_table.loc[clazz_name, 'Hierarchical silhouette'] = sh
|
||||
|
||||
compute_silhouette(f, clazz_name, 'kmeans')
|
||||
compute_silhouette(f, clazz_name, 'hierarchical')
|
||||
|
||||
print()
|
||||
df_table.index.name = 'Class Name'
|
||||
print(df_table.to_markdown())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Reference in a new issue