diff --git a/clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv b/clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv index 1ab076c..4519a8b 100644 --- a/clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv +++ b/clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv @@ -1,45 +1,45 @@ ,k_means,hierarchical -2,0.7008424223503156,0.3958383820498147 -3,0.5710705368479275,0.4083355324423938 -4,0.5612355754261723,0.4894345431495262 -5,0.45447105698494905,0.49390485171067744 -6,0.4542950961743021,0.49437178337314974 -7,0.5169337345938171,0.4996628258355101 -8,0.5048012323625627,0.504680719000111 -9,0.4981437021345769,0.5104029882614454 -10,0.514873610056946,0.39391549620101274 -11,0.4397616290614397,0.35593829934237226 -12,0.3966368345309925,0.3965649809723018 -13,0.40515142998089104,0.4035942512051252 -14,0.40783453521401053,0.41018624058063885 -15,0.4239033913796109,0.45557751119565765 -16,0.42065530265413026,0.47640709656766556 -17,0.44344469866152514,0.4974425160835303 -18,0.4400719065542468,0.5290487299051633 -19,0.44608395823875535,0.5485454650471248 -20,0.44877269935654723,0.5586056973417746 -21,0.48118392208651517,0.5385866967307906 -22,0.48389798035280496,0.538222592035968 -23,0.48663428414368126,0.550727295003801 -24,0.5087496231379599,0.5729072600132372 -25,0.5308958702007723,0.5954078415061489 -26,0.533742178035476,0.6182602907647171 -27,0.5366335268898433,0.6415000474402278 -28,0.5688721496510291,0.6464201697751911 -29,0.5718756117789308,0.6701951689242575 -30,0.5749678644659783,0.6738663960033637 -31,0.5975986067541601,0.6776704976739869 -32,0.6010454124801283,0.6759936834928909 -33,0.6047324451505658,0.6780082327270405 -34,0.6087467116081876,0.6819745883778254 -35,0.613131689815019,0.6860900076219251 -36,0.6386708325196511,0.6921336553243742 -37,0.6449490032291169,0.6964066920515507 -38,0.6764810977640761,0.6932020971027025 -39,0.6773895830074159,0.6977143227629022 -40,0.6918179479278735,0.7024070374495096 
-41,0.6964034645667346,0.7072912915110808 -42,0.7024471122691838,0.712379056158551 -43,0.7256701207957181,0.7176833214293175 -44,0.7230610997944976,0.7232182069292477 -45,0.7289990873402858,0.7289990873402857 +2,0.3958383820498147,0.7008424223503156 +3,0.4083355324423938,0.5710705368479275 +4,0.4894345431495262,0.5612355754261723 +5,0.49390485171067744,0.45447105698494905 +6,0.49437178337314974,0.4542950961743021 +7,0.4996628258355101,0.5169337345938171 +8,0.504680719000111,0.5048012323625627 +9,0.5104029882614454,0.4981437021345769 +10,0.39391549620101274,0.514873610056946 +11,0.35593829934237226,0.4397616290614397 +12,0.3965649809723018,0.3966368345309925 +13,0.4035942512051252,0.40515142998089104 +14,0.41018624058063885,0.40783453521401053 +15,0.45557751119565765,0.4239033913796109 +16,0.47640709656766556,0.42065530265413026 +17,0.4974425160835303,0.44344469866152514 +18,0.5290487299051633,0.4400719065542468 +19,0.5485454650471248,0.44608395823875535 +20,0.5586056973417746,0.44877269935654723 +21,0.5385866967307906,0.48118392208651517 +22,0.538222592035968,0.48389798035280496 +23,0.550727295003801,0.48663428414368126 +24,0.5729072600132372,0.5087496231379599 +25,0.5954078415061489,0.5308958702007723 +26,0.6182602907647171,0.533742178035476 +27,0.6415000474402278,0.5366335268898433 +28,0.6464201697751911,0.5688721496510291 +29,0.6701951689242575,0.5718756117789308 +30,0.6738663960033637,0.5749678644659783 +31,0.6776704976739869,0.5975986067541601 +32,0.6759936834928909,0.6010454124801283 +33,0.6780082327270405,0.6047324451505658 +34,0.6819745883778254,0.6087467116081876 +35,0.6860900076219251,0.613131689815019 +36,0.6921336553243742,0.6386708325196511 +37,0.6964066920515507,0.6449490032291169 +38,0.6932020971027025,0.6764810977640761 +39,0.6977143227629022,0.6773895830074159 +40,0.7024070374495096,0.6918179479278735 +41,0.7072912915110808,0.6964034645667346 +42,0.712379056158551,0.7024471122691838 +43,0.7176833214293175,0.7256701207957181 
+44,0.7232182069292477,0.7230610997944976 +45,0.7289990873402857,0.7289990873402858 diff --git a/clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.csv b/clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.csv new file mode 100644 index 0000000..f0ef1a7 --- /dev/null +++ b/clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.csv @@ -0,0 +1,89 @@ +,algorithm,k,min,mean,max +0,hierarchical,2,1,58.5,116 +1,k_means,2,1,58.5,116 +2,hierarchical,3,1,39.0,113 +3,k_means,3,1,39.0,115 +4,hierarchical,4,1,29.25,113 +5,k_means,4,1,29.25,98 +6,hierarchical,5,1,23.4,111 +7,k_means,5,1,23.4,98 +8,hierarchical,6,1,19.5,111 +9,k_means,6,1,19.5,98 +10,hierarchical,7,1,16.714285714285715,97 +11,k_means,7,1,16.714285714285715,98 +12,hierarchical,8,1,14.625,97 +13,k_means,8,1,14.625,98 +14,hierarchical,9,1,13.0,96 +15,k_means,9,1,13.0,97 +16,hierarchical,10,1,11.7,96 +17,k_means,10,1,11.7,92 +18,hierarchical,11,1,10.636363636363637,93 +19,k_means,11,1,10.636363636363637,89 +20,hierarchical,12,1,9.75,86 +21,k_means,12,1,9.75,84 +22,hierarchical,13,1,9.0,84 +23,k_means,13,1,9.0,83 +24,hierarchical,14,1,8.357142857142858,84 +25,k_means,14,1,8.357142857142858,83 +26,hierarchical,15,1,7.8,84 +27,k_means,15,1,7.8,77 +28,hierarchical,16,1,7.3125,84 +29,k_means,16,1,7.3125,75 +30,hierarchical,17,1,6.882352941176471,79 +31,k_means,17,1,6.882352941176471,73 +32,hierarchical,18,1,6.5,79 +33,k_means,18,1,6.5,70 +34,hierarchical,19,1,6.157894736842105,46 +35,k_means,19,1,6.157894736842105,70 +36,hierarchical,20,1,5.85,46 +37,k_means,20,1,5.85,70 +38,hierarchical,21,1,5.571428571428571,46 +39,k_means,21,1,5.571428571428571,70 +40,hierarchical,22,1,5.318181818181818,46 +41,k_means,22,1,5.318181818181818,70 +42,hierarchical,23,1,5.086956521739131,46 +43,k_means,23,1,5.086956521739131,68 +44,hierarchical,24,1,4.875,46 +45,k_means,24,1,4.875,66 +46,hierarchical,25,1,4.68,46 +47,k_means,25,1,4.68,64 +48,hierarchical,26,1,4.5,46 +49,k_means,26,1,4.5,62 
+50,hierarchical,27,1,4.333333333333333,46 +51,k_means,27,1,4.333333333333333,60 +52,hierarchical,28,1,4.178571428571429,46 +53,k_means,28,1,4.178571428571429,60 +54,hierarchical,29,1,4.0344827586206895,46 +55,k_means,29,1,4.0344827586206895,58 +56,hierarchical,30,1,3.9,46 +57,k_means,30,1,3.9,57 +58,hierarchical,31,1,3.774193548387097,46 +59,k_means,31,1,3.774193548387097,56 +60,hierarchical,32,1,3.65625,46 +61,k_means,32,1,3.65625,56 +62,hierarchical,33,1,3.5454545454545454,46 +63,k_means,33,1,3.5454545454545454,56 +64,hierarchical,34,1,3.4411764705882355,46 +65,k_means,34,1,3.4411764705882355,55 +66,hierarchical,35,1,3.342857142857143,46 +67,k_means,35,1,3.342857142857143,54 +68,hierarchical,36,1,3.25,46 +69,k_means,36,1,3.25,54 +70,hierarchical,37,1,3.1621621621621623,46 +71,k_means,37,1,3.1621621621621623,53 +72,hierarchical,38,1,3.0789473684210527,46 +73,k_means,38,1,3.0789473684210527,53 +74,hierarchical,39,1,3.0,46 +75,k_means,39,1,3.0,52 +76,hierarchical,40,1,2.925,46 +77,k_means,40,1,2.925,51 +78,hierarchical,41,1,2.8536585365853657,46 +79,k_means,41,1,2.8536585365853657,50 +80,hierarchical,42,1,2.7857142857142856,46 +81,k_means,42,1,2.7857142857142856,49 +82,hierarchical,43,1,2.7209302325581395,46 +83,k_means,43,1,2.7209302325581395,48 +84,hierarchical,44,1,2.659090909090909,46 +85,k_means,44,1,2.659090909090909,47 +86,hierarchical,45,1,2.6,46 +87,k_means,45,1,2.6,46 diff --git a/clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.png b/clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.png new file mode 100644 index 0000000..f04b583 Binary files /dev/null and b/clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.png differ diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv index 868f352..71fb465 100644 --- a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv +++ b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv @@ 
-1,92 +1,92 @@ ,cluster -getGrammarDescription,5 +getGrammarDescription,0 getElementDeclIsExternal,0 getAttributeDeclIsExternal,0 -getAttributeDeclIndex,42 -startDTD,2 -startParameterEntity,6 -startExternalSubset,41 -endParameterEntity,6 -endExternalSubset,41 -elementDecl,31 -attributeDecl,39 -internalEntityDecl,40 -externalEntityDecl,40 -unparsedEntityDecl,40 -notationDecl,38 -endDTD,8 -setDTDSource,23 -getDTDSource,23 -textDecl,5 -comment,5 -processingInstruction,5 -startAttlist,5 -endAttlist,5 -startConditional,5 -ignoredCharacters,5 -endConditional,5 -setDTDContentModelSource,11 -getDTDContentModelSource,11 -startContentModel,33 -startGroup,53 -pcdata,51 -element,34 -separator,44 -occurrence,32 -endGroup,35 -any,5 -empty,5 -endContentModel,5 -isNamespaceAware,5 -getSymbolTable,52 -getFirstElementDeclIndex,7 -getNextElementDeclIndex,7 -getElementDeclIndex,55 -getContentSpecType,37 -getElementDecl,54 -getElementDeclName,8 -getFirstAttributeDeclIndex,20 -getNextAttributeDeclIndex,57 -getAttributeDecl,28 -isCDATAAttribute,47 -getEntityDeclIndex,5 -getEntityDecl,1 -getNotationDeclIndex,5 -getNotationDecl,10 -getContentSpec,9 -getContentSpecIndex,19 -getContentSpecAsString,43 -printElements,50 -printAttributes,49 -addContentSpecToElement,29 -getElementContentModelValidator,25 -createElementDecl,36 -setElementDecl,17 -putElementNameMapping,5 -setFirstAttributeDeclIndex,20 -setContentSpecIndex,19 -createAttributeDecl,18 -setAttributeDecl,48 -createContentSpec,21 -setContentSpec,9 -createEntityDecl,16 -setEntityDecl,1 -createNotationDecl,56 -setNotationDecl,10 -addContentSpecNode,4 -addUniqueLeafNode,4 -initializeContentModelStack,2 -isImmutable,24 -appendContentSpec,45 -printAttribute,26 -createChildModel,3 -buildSyntaxTree,3 -contentSpecTree,46 -ensureElementDeclCapacity,27 -ensureAttributeDeclCapacity,30 -ensureEntityDeclCapacity,14 -ensureNotationDeclCapacity,13 -ensureContentSpecCapacity,12 -resize,5 -isEntityDeclared,22 -isEntityUnparsed,15 
+getAttributeDeclIndex,0 +startDTD,0 +startParameterEntity,0 +startExternalSubset,0 +endParameterEntity,0 +endExternalSubset,0 +elementDecl,1 +attributeDecl,1 +internalEntityDecl,0 +externalEntityDecl,0 +unparsedEntityDecl,0 +notationDecl,0 +endDTD,0 +setDTDSource,0 +getDTDSource,0 +textDecl,0 +comment,0 +processingInstruction,0 +startAttlist,0 +endAttlist,0 +startConditional,0 +ignoredCharacters,0 +endConditional,0 +setDTDContentModelSource,0 +getDTDContentModelSource,0 +startContentModel,0 +startGroup,0 +pcdata,0 +element,0 +separator,0 +occurrence,0 +endGroup,0 +any,0 +empty,0 +endContentModel,0 +isNamespaceAware,0 +getSymbolTable,0 +getFirstElementDeclIndex,0 +getNextElementDeclIndex,0 +getElementDeclIndex,0 +getContentSpecType,0 +getElementDecl,0 +getElementDeclName,0 +getFirstAttributeDeclIndex,0 +getNextAttributeDeclIndex,0 +getAttributeDecl,0 +isCDATAAttribute,0 +getEntityDeclIndex,0 +getEntityDecl,0 +getNotationDeclIndex,0 +getNotationDecl,0 +getContentSpec,0 +getContentSpecIndex,0 +getContentSpecAsString,0 +printElements,0 +printAttributes,0 +addContentSpecToElement,0 +getElementContentModelValidator,0 +createElementDecl,0 +setElementDecl,0 +putElementNameMapping,0 +setFirstAttributeDeclIndex,0 +setContentSpecIndex,0 +createAttributeDecl,0 +setAttributeDecl,0 +createContentSpec,0 +setContentSpec,0 +createEntityDecl,0 +setEntityDecl,0 +createNotationDecl,0 +setNotationDecl,0 +addContentSpecNode,0 +addUniqueLeafNode,0 +initializeContentModelStack,0 +isImmutable,0 +appendContentSpec,0 +printAttribute,0 +createChildModel,0 +buildSyntaxTree,0 +contentSpecTree,0 +ensureElementDeclCapacity,0 +ensureAttributeDeclCapacity,0 +ensureEntityDeclCapacity,0 +ensureNotationDeclCapacity,0 +ensureContentSpecCapacity,0 +resize,0 +isEntityDeclared,0 +isEntityUnparsed,0 diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv index 9e12509..3fef18a 100644 --- 
a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv +++ b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv @@ -1,92 +1,92 @@ ,cluster -getGrammarDescription,1 -getElementDeclIsExternal,0 -getAttributeDeclIsExternal,0 -getAttributeDeclIndex,1 -startDTD,1 -startParameterEntity,1 -startExternalSubset,1 -endParameterEntity,1 -endExternalSubset,1 -elementDecl,0 -attributeDecl,0 -internalEntityDecl,1 -externalEntityDecl,1 -unparsedEntityDecl,1 -notationDecl,1 -endDTD,0 -setDTDSource,1 -getDTDSource,1 -textDecl,1 -comment,1 -processingInstruction,1 -startAttlist,1 -endAttlist,1 -startConditional,1 -ignoredCharacters,1 -endConditional,1 -setDTDContentModelSource,1 -getDTDContentModelSource,1 -startContentModel,1 -startGroup,1 -pcdata,1 -element,1 -separator,1 -occurrence,1 -endGroup,1 -any,1 -empty,1 -endContentModel,1 -isNamespaceAware,1 -getSymbolTable,1 -getFirstElementDeclIndex,1 -getNextElementDeclIndex,1 -getElementDeclIndex,1 -getContentSpecType,0 +getGrammarDescription,6 +getElementDeclIsExternal,50 +getAttributeDeclIsExternal,43 +getAttributeDeclIndex,36 +startDTD,13 +startParameterEntity,23 +startExternalSubset,54 +endParameterEntity,23 +endExternalSubset,54 +elementDecl,5 +attributeDecl,4 +internalEntityDecl,17 +externalEntityDecl,17 +unparsedEntityDecl,17 +notationDecl,40 +endDTD,29 +setDTDSource,53 +getDTDSource,53 +textDecl,6 +comment,6 +processingInstruction,6 +startAttlist,6 +endAttlist,6 +startConditional,6 +ignoredCharacters,6 +endConditional,6 +setDTDContentModelSource,51 +getDTDContentModelSource,51 +startContentModel,28 +startGroup,39 +pcdata,52 +element,2 +separator,49 +occurrence,24 +endGroup,27 +any,6 +empty,6 +endContentModel,6 +isNamespaceAware,6 +getSymbolTable,56 +getFirstElementDeclIndex,47 +getNextElementDeclIndex,47 +getElementDeclIndex,57 +getContentSpecType,38 getElementDecl,0 -getElementDeclName,0 -getFirstAttributeDeclIndex,0 -getNextAttributeDeclIndex,0 -getAttributeDecl,0 -isCDATAAttribute,1 -getEntityDeclIndex,1 
-getEntityDecl,0 -getNotationDeclIndex,1 -getNotationDecl,0 -getContentSpec,0 -getContentSpecIndex,0 -getContentSpecAsString,0 -printElements,1 -printAttributes,1 -addContentSpecToElement,1 -getElementContentModelValidator,0 -createElementDecl,0 -setElementDecl,0 -putElementNameMapping,1 -setFirstAttributeDeclIndex,0 -setContentSpecIndex,0 -createAttributeDecl,0 -setAttributeDecl,0 -createContentSpec,0 -setContentSpec,0 -createEntityDecl,0 -setEntityDecl,0 -createNotationDecl,1 -setNotationDecl,0 -addContentSpecNode,1 -addUniqueLeafNode,1 -initializeContentModelStack,1 -isImmutable,1 -appendContentSpec,1 +getElementDeclName,29 +getFirstAttributeDeclIndex,3 +getNextAttributeDeclIndex,46 +getAttributeDecl,25 +isCDATAAttribute,44 +getEntityDeclIndex,6 +getEntityDecl,8 +getNotationDeclIndex,6 +getNotationDecl,10 +getContentSpec,41 +getContentSpecIndex,12 +getContentSpecAsString,37 +printElements,55 +printAttributes,35 +addContentSpecToElement,20 +getElementContentModelValidator,21 +createElementDecl,33 +setElementDecl,16 +putElementNameMapping,6 +setFirstAttributeDeclIndex,3 +setContentSpecIndex,12 +createAttributeDecl,19 +setAttributeDecl,7 +createContentSpec,41 +setContentSpec,9 +createEntityDecl,31 +setEntityDecl,8 +createNotationDecl,32 +setNotationDecl,10 +addContentSpecNode,18 +addUniqueLeafNode,18 +initializeContentModelStack,13 +isImmutable,6 +appendContentSpec,42 printAttribute,1 -createChildModel,1 -buildSyntaxTree,1 -contentSpecTree,1 -ensureElementDeclCapacity,1 -ensureAttributeDeclCapacity,1 -ensureEntityDeclCapacity,1 -ensureNotationDeclCapacity,1 -ensureContentSpecCapacity,1 -resize,1 -isEntityDeclared,1 -isEntityUnparsed,0 +createChildModel,11 +buildSyntaxTree,11 +contentSpecTree,30 +ensureElementDeclCapacity,15 +ensureAttributeDeclCapacity,14 +ensureEntityDeclCapacity,22 +ensureNotationDeclCapacity,26 +ensureContentSpecCapacity,34 +resize,6 +isEntityDeclared,48 +isEntityUnparsed,45 diff --git 
a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv index 8279968..9ed92c6 100644 --- a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv +++ b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv @@ -1,64 +1,64 @@ ,k_means,hierarchical -2,0.43549549160206547,0.22916634455195753 -3,0.3737398924595095,0.2246280732293034 -4,0.3557451009153901,0.22489420158108267 -5,0.23295505680144496,0.23659327576115802 -6,0.262133112331066,0.1944787865029721 -7,0.2578980101543562,0.14449036253228517 -8,0.2549368125378225,0.14148366678653188 -9,0.2774793093993747,0.13842552961645824 -10,0.29633149188806335,0.17251507022640497 -11,0.28457149559807815,0.20347568890084347 -12,0.2774764884391462,0.23906895503283213 -13,0.2807117319594596,0.2433263434151139 -14,0.2756438988231549,0.2378679295617759 -15,0.2725133030686268,0.23691994972126937 -16,0.26609972785171476,0.23116431400607626 -17,0.2622978716191777,0.250626112587838 -18,0.2599277555662332,0.25367962227891766 -19,0.2627008352505403,0.27152241207311917 -20,0.27904812684322156,0.2937526253744639 -21,0.2862853638532431,0.29866907908096096 -22,0.28363618305324206,0.2982900685039696 -23,0.27298124922178313,0.29267556171442216 -24,0.2755401967064185,0.30932258932020334 -25,0.2699256899168711,0.30931433471981734 -26,0.27327610109462835,0.309284891816073 -27,0.2741779110906256,0.30820922828647973 -28,0.2772726745209296,0.306394576589556 -29,0.2763152122041744,0.31629054291989955 -30,0.27822954116587556,0.31889378927031037 -31,0.2765547788352012,0.31793632695355517 -32,0.2873045247363621,0.3198803243841521 -33,0.28417974562649284,0.3322984924566154 -34,0.2685472504040367,0.33205224383012144 -35,0.2640970877653046,0.32946433944653786 -36,0.26594127941463497,0.34064178452545657 -37,0.2671662834055061,0.34279546744648637 -38,0.26972862144514015,0.3520414342812306 -39,0.2745566131731437,0.35684038034252413 
-40,0.3085760240111521,0.34927826706954956 -41,0.32756637032777863,0.3490443084779255 -42,0.3310796986888577,0.34372287471805796 -43,0.32889480000768656,0.32421890240508233 -44,0.31610864049926274,0.27315698867962007 -45,0.3140921194105564,0.27051011105427114 -46,0.3088953240503273,0.29627121773250714 -47,0.2693097731576138,0.32261382027270064 -48,0.2809797636777669,0.3299248655060567 -49,0.29384518410058824,0.3171387059976329 -50,0.29793575895571417,0.3442080317722919 -51,0.3025569827442159,0.3408776851426114 -52,0.32032808958922193,0.3408776851426114 -53,0.33852852210954587,0.33765907834246356 -54,0.339541278009214,0.36565310355269914 -55,0.35774171052953796,0.3818328805784584 -56,0.37594214304986195,0.3866470678901348 -57,0.4080257854586148,0.3919955336887361 -58,0.4046954388289342,0.3979724365432809 -59,0.4046954388289342,0.38857621891133143 -60,0.3931263574608019,0.3953492191827632 -61,0.38155727609266954,0.38748610984623766 -62,0.37132316722174985,0.39516141319506437 -63,0.36810456042160206,0.3850224051641811 -64,0.3565354790534698,0.3785851915638855 +2,0.22916634455195753,0.43549549160206547 +3,0.2246280732293034,0.3737398924595095 +4,0.22489420158108267,0.3557451009153901 +5,0.23659327576115802,0.23295505680144496 +6,0.1944787865029721,0.262133112331066 +7,0.14449036253228517,0.2578980101543562 +8,0.14148366678653188,0.2549368125378225 +9,0.13842552961645824,0.2774793093993747 +10,0.17251507022640497,0.29633149188806335 +11,0.20347568890084347,0.28457149559807815 +12,0.23906895503283213,0.2774764884391462 +13,0.2433263434151139,0.2807117319594596 +14,0.2378679295617759,0.2756438988231549 +15,0.23691994972126937,0.2725133030686268 +16,0.23116431400607626,0.26609972785171476 +17,0.250626112587838,0.2622978716191777 +18,0.25367962227891766,0.2599277555662332 +19,0.27152241207311917,0.2627008352505403 +20,0.2937526253744639,0.27904812684322156 +21,0.29866907908096096,0.2862853638532431 +22,0.2982900685039696,0.28363618305324206 
+23,0.29267556171442216,0.27298124922178313 +24,0.30932258932020334,0.2755401967064185 +25,0.30931433471981734,0.2699256899168711 +26,0.309284891816073,0.27327610109462835 +27,0.30820922828647973,0.2741779110906256 +28,0.306394576589556,0.2772726745209296 +29,0.31629054291989955,0.2763152122041744 +30,0.31889378927031037,0.27822954116587556 +31,0.31793632695355517,0.2765547788352012 +32,0.3198803243841521,0.2873045247363621 +33,0.3322984924566154,0.28417974562649284 +34,0.33205224383012144,0.2685472504040367 +35,0.32946433944653786,0.2640970877653046 +36,0.34064178452545657,0.26594127941463497 +37,0.34279546744648637,0.2671662834055061 +38,0.3520414342812306,0.26972862144514015 +39,0.35684038034252413,0.2745566131731437 +40,0.34927826706954956,0.3085760240111521 +41,0.3490443084779255,0.32756637032777863 +42,0.34372287471805796,0.3310796986888577 +43,0.32421890240508233,0.32889480000768656 +44,0.27315698867962007,0.31610864049926274 +45,0.27051011105427114,0.3140921194105564 +46,0.29627121773250714,0.3088953240503273 +47,0.32261382027270064,0.2693097731576138 +48,0.3299248655060567,0.2809797636777669 +49,0.3171387059976329,0.29384518410058824 +50,0.3442080317722919,0.29793575895571417 +51,0.3408776851426114,0.3025569827442159 +52,0.3408776851426114,0.32032808958922193 +53,0.33765907834246356,0.33852852210954587 +54,0.36565310355269914,0.339541278009214 +55,0.3818328805784584,0.35774171052953796 +56,0.3866470678901348,0.37594214304986195 +57,0.3919955336887361,0.4080257854586148 +58,0.3979724365432809,0.4046954388289342 +59,0.38857621891133143,0.4046954388289342 +60,0.3953492191827632,0.3931263574608019 +61,0.38748610984623766,0.38155727609266954 +62,0.39516141319506437,0.37132316722174985 +63,0.3850224051641811,0.36810456042160206 +64,0.3785851915638855,0.3565354790534698 diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.csv new file mode 100644 index 0000000..6e888c2 --- /dev/null +++ 
b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.csv @@ -0,0 +1,127 @@ +,algorithm,k,min,mean,max +0,hierarchical,2,2,45.5,89 +1,k_means,2,29,45.5,62 +2,hierarchical,3,2,30.333333333333332,85 +3,k_means,3,3,30.333333333333332,62 +4,hierarchical,4,2,22.75,82 +5,k_means,4,3,22.75,56 +6,hierarchical,5,2,18.2,64 +7,k_means,5,3,18.2,55 +8,hierarchical,6,2,15.166666666666666,57 +9,k_means,6,3,15.166666666666666,51 +10,hierarchical,7,1,13.0,57 +11,k_means,7,2,13.0,50 +12,hierarchical,8,1,11.375,57 +13,k_means,8,1,11.375,57 +14,hierarchical,9,1,10.11111111111111,57 +15,k_means,9,1,10.11111111111111,57 +16,hierarchical,10,1,9.1,50 +17,k_means,10,1,9.1,51 +18,hierarchical,11,1,8.272727272727273,46 +19,k_means,11,1,8.272727272727273,50 +20,hierarchical,12,1,7.583333333333333,46 +21,k_means,12,1,7.583333333333333,47 +22,hierarchical,13,1,7.0,46 +23,k_means,13,1,7.0,46 +24,hierarchical,14,1,6.5,46 +25,k_means,14,1,6.5,46 +26,hierarchical,15,1,6.066666666666666,46 +27,k_means,15,1,6.066666666666666,46 +28,hierarchical,16,1,5.6875,46 +29,k_means,16,1,5.6875,46 +30,hierarchical,17,1,5.352941176470588,39 +31,k_means,17,1,5.352941176470588,44 +32,hierarchical,18,1,5.055555555555555,39 +33,k_means,18,1,5.055555555555555,44 +34,hierarchical,19,1,4.7894736842105265,34 +35,k_means,19,1,4.7894736842105265,43 +36,hierarchical,20,1,4.55,34 +37,k_means,20,1,4.55,41 +38,hierarchical,21,1,4.333333333333333,34 +39,k_means,21,1,4.333333333333333,40 +40,hierarchical,22,1,4.136363636363637,34 +41,k_means,22,1,4.136363636363637,40 +42,hierarchical,23,1,3.9565217391304346,31 +43,k_means,23,1,3.9565217391304346,40 +44,hierarchical,24,1,3.7916666666666665,31 +45,k_means,24,1,3.7916666666666665,38 +46,hierarchical,25,1,3.64,31 +47,k_means,25,1,3.64,38 +48,hierarchical,26,1,3.5,31 +49,k_means,26,1,3.5,38 +50,hierarchical,27,1,3.3703703703703702,31 +51,k_means,27,1,3.3703703703703702,38 +52,hierarchical,28,1,3.25,30 +53,k_means,28,1,3.25,38 +54,hierarchical,29,1,3.1379310344827585,30 
+55,k_means,29,1,3.1379310344827585,36 +56,hierarchical,30,1,3.033333333333333,30 +57,k_means,30,1,3.033333333333333,35 +58,hierarchical,31,1,2.935483870967742,30 +59,k_means,31,1,2.935483870967742,35 +60,hierarchical,32,1,2.84375,30 +61,k_means,32,1,2.84375,35 +62,hierarchical,33,1,2.757575757575758,30 +63,k_means,33,1,2.757575757575758,33 +64,hierarchical,34,1,2.676470588235294,30 +65,k_means,34,1,2.676470588235294,33 +66,hierarchical,35,1,2.6,30 +67,k_means,35,1,2.6,33 +68,hierarchical,36,1,2.5277777777777777,30 +69,k_means,36,1,2.5277777777777777,33 +70,hierarchical,37,1,2.4594594594594597,30 +71,k_means,37,1,2.4594594594594597,33 +72,hierarchical,38,1,2.3947368421052633,30 +73,k_means,38,1,2.3947368421052633,33 +74,hierarchical,39,1,2.3333333333333335,29 +75,k_means,39,1,2.3333333333333335,32 +76,hierarchical,40,1,2.275,29 +77,k_means,40,1,2.275,32 +78,hierarchical,41,1,2.2195121951219514,29 +79,k_means,41,1,2.2195121951219514,32 +80,hierarchical,42,1,2.1666666666666665,29 +81,k_means,42,1,2.1666666666666665,32 +82,hierarchical,43,1,2.116279069767442,29 +83,k_means,43,1,2.116279069767442,31 +84,hierarchical,44,1,2.0681818181818183,29 +85,k_means,44,1,2.0681818181818183,31 +86,hierarchical,45,1,2.022222222222222,29 +87,k_means,45,1,2.022222222222222,31 +88,hierarchical,46,1,1.9782608695652173,29 +89,k_means,46,1,1.9782608695652173,29 +90,hierarchical,47,1,1.9361702127659575,28 +91,k_means,47,1,1.9361702127659575,27 +92,hierarchical,48,1,1.8958333333333333,17 +93,k_means,48,1,1.8958333333333333,27 +94,hierarchical,49,1,1.8571428571428572,17 +95,k_means,49,1,1.8571428571428572,27 +96,hierarchical,50,1,1.82,17 +97,k_means,50,1,1.82,25 +98,hierarchical,51,1,1.7843137254901962,17 +99,k_means,51,1,1.7843137254901962,25 +100,hierarchical,52,1,1.75,17 +101,k_means,52,1,1.75,25 +102,hierarchical,53,1,1.7169811320754718,17 +103,k_means,53,1,1.7169811320754718,25 +104,hierarchical,54,1,1.6851851851851851,17 +105,k_means,54,1,1.6851851851851851,23 
+106,hierarchical,55,1,1.6545454545454545,17 +107,k_means,55,1,1.6545454545454545,21 +108,hierarchical,56,1,1.625,17 +109,k_means,56,1,1.625,20 +110,hierarchical,57,1,1.5964912280701755,17 +111,k_means,57,1,1.5964912280701755,19 +112,hierarchical,58,1,1.5689655172413792,17 +113,k_means,58,1,1.5689655172413792,18 +114,hierarchical,59,1,1.5423728813559323,17 +115,k_means,59,1,1.5423728813559323,18 +116,hierarchical,60,1,1.5166666666666666,17 +117,k_means,60,1,1.5166666666666666,17 +118,hierarchical,61,1,1.4918032786885247,17 +119,k_means,61,1,1.4918032786885247,17 +120,hierarchical,62,1,1.467741935483871,17 +121,k_means,62,1,1.467741935483871,16 +122,hierarchical,63,1,1.4444444444444444,17 +123,k_means,63,1,1.4444444444444444,16 +124,hierarchical,64,1,1.421875,17 +125,k_means,64,1,1.421875,16 diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.png b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.png new file mode 100644 index 0000000..dd5fb9b Binary files /dev/null and b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.png differ diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv index 590e725..6b64892 100644 --- a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv @@ -11,7 +11,7 @@ constructTrees,0 isExistingGrammar,0 updateImportListFor,0 updateImportListWith,0 -buildGlobalNameRegistries,0 +buildGlobalNameRegistries,2 traverseSchemas,0 needReportTNSError,0 addGlobalAttributeDecl,0 diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv index 2d40cfd..590e725 100644 --- a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv @@ 
-36,7 +36,7 @@ getGrpOrAttrGrpRedefinedByRestriction,0 resolveKeyRefs,0 getIDRegistry,0 getIDRegistry_sub,0 -storeKeyRef,2 +storeKeyRef,0 resolveSchema,0 resolveSchemaSource,0 getSchemaDocument,0 diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv index 27e595c..e696220 100644 --- a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv @@ -1,64 +1,64 @@ ,k_means,hierarchical 2,0.598553678618089,0.598553678618089 -3,0.5988635577957939,0.4340479638200015 -4,0.5768968986366794,0.4446910360233003 -5,0.5600582308059449,0.44970649031040394 -6,0.4754961922118064,0.42291318953408236 -7,0.4716129791423394,0.37302776265331616 -8,0.4215235973451702,0.16585341129364783 -9,0.4270412461769427,0.1805562766904707 -10,0.42033460704259445,0.10021703881198853 -11,0.3979172260202459,0.11227880527684016 -12,0.3992377842624971,0.13834960978465374 -13,0.36351812430049024,0.14637482631499601 -14,0.36086605127470145,0.15339700393049752 -15,0.27803229144747893,0.17519153912543511 -16,0.24764306057751692,0.18163489682652323 -17,0.2546247662068935,0.1920283064393974 -18,0.27281600254442556,0.1968887014348958 -19,0.2705186834360297,0.22320550740329767 -20,0.29969231483298964,0.2278559856358303 -21,0.31507175636228785,0.21631113790331308 -22,0.3170839571491974,0.23240216910856668 -23,0.3201062001500274,0.23590483919206368 -24,0.24788100874579763,0.24413294581937137 -25,0.24932439019964475,0.2514892577758059 -26,0.25682838168308425,0.2573251636281981 -27,0.25691108409617125,0.2629575908594159 -28,0.2606141413445487,0.27452933746874875 -29,0.2538973293819504,0.27138907448677696 -30,0.26300677134410877,0.27608901099722993 -31,0.25958787047274295,0.2734068655042204 -32,0.2620577633391267,0.26668692055998694 -33,0.2677665846189286,0.27190541883537933 -34,0.2698493206362974,0.2737953942290021 
-35,0.26871523120875485,0.2792490491212266 -36,0.2706224838853019,0.285255446778457 -37,0.27443698923839605,0.28791640737048424 -38,0.27814176822064324,0.279340819685821 -39,0.27606460269195954,0.27183941054653343 -40,0.27307694582354536,0.2763117563875985 -41,0.27681213230462487,0.28090687018324295 -42,0.2763401507651925,0.2823757748346625 -43,0.2905756087008992,0.284106939165533 -44,0.2901305465431984,0.28892051024774673 -45,0.2866682178814574,0.31067801214219776 -46,0.2748147271365624,0.33271255209429573 -47,0.2824570955002154,0.33809240352772785 -48,0.28532199483886955,0.342466005901906 -49,0.3194897125271686,0.3472678571815208 -50,0.32227149882332984,0.3494143489069156 -51,0.32507315719109064,0.3553262507378467 -52,0.3483927215781152,0.3577437943048381 -53,0.35116863658039477,0.36265544445738723 -54,0.35397195095412226,0.3652777392559547 -55,0.35681293946359083,0.3635299110583668 -56,0.35970774816697515,0.36657235082485046 -57,0.36268328716123316,0.369801843033111 -58,0.365787994620889,0.373236109725014 -59,0.36911874689499113,0.3768951711426859 -60,0.3899012348681259,0.38080172755114144 -61,0.3925538526988892,0.38498161884368615 -62,0.3952064705296526,0.38946438071227807 -63,0.40051170619117926,0.39428392137375445 -64,0.4003435986724249,0.4003771346837245 +3,0.4340479638200015,0.5988635577957939 +4,0.4446910360233003,0.5768968986366794 +5,0.44970649031040394,0.5600582308059449 +6,0.42291318953408236,0.4754961922118064 +7,0.37302776265331616,0.4716129791423394 +8,0.16585341129364783,0.4215235973451702 +9,0.1805562766904707,0.4270412461769427 +10,0.10021703881198853,0.42033460704259445 +11,0.11227880527684016,0.3979172260202459 +12,0.13834960978465374,0.3992377842624971 +13,0.14637482631499601,0.36351812430049024 +14,0.15339700393049752,0.36086605127470145 +15,0.17519153912543511,0.27803229144747893 +16,0.18163489682652323,0.24764306057751692 +17,0.1920283064393974,0.2546247662068935 +18,0.1968887014348958,0.27281600254442556 
+19,0.22320550740329767,0.2705186834360297 +20,0.2278559856358303,0.29969231483298964 +21,0.21631113790331308,0.31507175636228785 +22,0.23240216910856668,0.3170839571491974 +23,0.23590483919206368,0.3201062001500274 +24,0.24413294581937137,0.24788100874579763 +25,0.2514892577758059,0.24932439019964475 +26,0.2573251636281981,0.25682838168308425 +27,0.2629575908594159,0.25691108409617125 +28,0.27452933746874875,0.2606141413445487 +29,0.27138907448677696,0.2538973293819504 +30,0.27608901099722993,0.26300677134410877 +31,0.2734068655042204,0.25958787047274295 +32,0.26668692055998694,0.2620577633391267 +33,0.27190541883537933,0.2677665846189286 +34,0.2737953942290021,0.2698493206362974 +35,0.2792490491212266,0.26871523120875485 +36,0.285255446778457,0.2706224838853019 +37,0.28791640737048424,0.27443698923839605 +38,0.279340819685821,0.27814176822064324 +39,0.27183941054653343,0.27606460269195954 +40,0.2763117563875985,0.27307694582354536 +41,0.28090687018324295,0.27681213230462487 +42,0.2823757748346625,0.2763401507651925 +43,0.284106939165533,0.2905756087008992 +44,0.28892051024774673,0.2901305465431984 +45,0.31067801214219776,0.2866682178814574 +46,0.33271255209429573,0.2748147271365624 +47,0.33809240352772785,0.2824570955002154 +48,0.342466005901906,0.28532199483886955 +49,0.3472678571815208,0.3194897125271686 +50,0.3494143489069156,0.32227149882332984 +51,0.3553262507378467,0.32507315719109064 +52,0.3577437943048381,0.3483927215781152 +53,0.36265544445738723,0.35116863658039477 +54,0.3652777392559547,0.35397195095412226 +55,0.3635299110583668,0.35681293946359083 +56,0.36657235082485046,0.35970774816697515 +57,0.369801843033111,0.36268328716123316 +58,0.373236109725014,0.365787994620889 +59,0.3768951711426859,0.36911874689499113 +60,0.38080172755114144,0.3899012348681259 +61,0.38498161884368615,0.3925538526988892 +62,0.38946438071227807,0.3952064705296526 +63,0.39428392137375445,0.40051170619117926 +64,0.4003771346837245,0.4003435986724249 diff --git 
a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.csv new file mode 100644 index 0000000..b0f9ae7 --- /dev/null +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.csv @@ -0,0 +1,127 @@ +,algorithm,k,min,mean,max +0,hierarchical,2,1,53.0,105 +1,k_means,2,1,53.0,105 +2,hierarchical,3,1,35.333333333333336,104 +3,k_means,3,1,35.333333333333336,104 +4,hierarchical,4,1,26.5,102 +5,k_means,4,1,26.5,102 +6,hierarchical,5,1,21.2,102 +7,k_means,5,1,21.2,101 +8,hierarchical,6,1,17.666666666666668,99 +9,k_means,6,1,17.666666666666668,99 +10,hierarchical,7,1,15.142857142857142,98 +11,k_means,7,1,15.142857142857142,98 +12,hierarchical,8,1,13.25,96 +13,k_means,8,1,13.25,91 +14,hierarchical,9,1,11.777777777777779,96 +15,k_means,9,1,11.777777777777779,90 +16,hierarchical,10,1,10.6,95 +17,k_means,10,1,10.6,86 +18,hierarchical,11,1,9.636363636363637,94 +19,k_means,11,1,9.636363636363637,84 +20,hierarchical,12,1,8.833333333333334,93 +21,k_means,12,1,8.833333333333334,82 +22,hierarchical,13,1,8.153846153846153,91 +23,k_means,13,1,8.153846153846153,81 +24,hierarchical,14,1,7.571428571428571,91 +25,k_means,14,1,7.571428571428571,80 +26,hierarchical,15,1,7.066666666666666,83 +27,k_means,15,1,7.066666666666666,76 +28,hierarchical,16,1,6.625,83 +29,k_means,16,1,6.625,75 +30,hierarchical,17,1,6.235294117647059,78 +31,k_means,17,1,6.235294117647059,74 +32,hierarchical,18,1,5.888888888888889,78 +33,k_means,18,1,5.888888888888889,73 +34,hierarchical,19,1,5.578947368421052,78 +35,k_means,19,1,5.578947368421052,71 +36,hierarchical,20,1,5.3,71 +37,k_means,20,1,5.3,70 +38,hierarchical,21,1,5.0476190476190474,68 +39,k_means,21,1,5.0476190476190474,69 +40,hierarchical,22,1,4.818181818181818,68 +41,k_means,22,1,4.818181818181818,65 +42,hierarchical,23,1,4.608695652173913,68 +43,k_means,23,1,4.608695652173913,65 +44,hierarchical,24,1,4.416666666666667,64 +45,k_means,24,1,4.416666666666667,64 
+46,hierarchical,25,1,4.24,64 +47,k_means,25,1,4.24,62 +48,hierarchical,26,1,4.076923076923077,64 +49,k_means,26,1,4.076923076923077,61 +50,hierarchical,27,1,3.925925925925926,64 +51,k_means,27,1,3.925925925925926,60 +52,hierarchical,28,1,3.7857142857142856,63 +53,k_means,28,1,3.7857142857142856,55 +54,hierarchical,29,1,3.6551724137931036,63 +55,k_means,29,1,3.6551724137931036,55 +56,hierarchical,30,1,3.533333333333333,63 +57,k_means,30,1,3.533333333333333,54 +58,hierarchical,31,1,3.4193548387096775,63 +59,k_means,31,1,3.4193548387096775,54 +60,hierarchical,32,1,3.3125,52 +61,k_means,32,1,3.3125,54 +62,hierarchical,33,1,3.212121212121212,52 +63,k_means,33,1,3.212121212121212,53 +64,hierarchical,34,1,3.1176470588235294,52 +65,k_means,34,1,3.1176470588235294,52 +66,hierarchical,35,1,3.0285714285714285,52 +67,k_means,35,1,3.0285714285714285,51 +68,hierarchical,36,1,2.9444444444444446,52 +69,k_means,36,1,2.9444444444444446,50 +70,hierarchical,37,1,2.864864864864865,52 +71,k_means,37,1,2.864864864864865,50 +72,hierarchical,38,1,2.789473684210526,51 +73,k_means,38,1,2.789473684210526,50 +74,hierarchical,39,1,2.717948717948718,51 +75,k_means,39,1,2.717948717948718,50 +76,hierarchical,40,1,2.65,51 +77,k_means,40,1,2.65,49 +78,hierarchical,41,1,2.5853658536585367,51 +79,k_means,41,1,2.5853658536585367,48 +80,hierarchical,42,1,2.5238095238095237,51 +81,k_means,42,1,2.5238095238095237,47 +82,hierarchical,43,1,2.4651162790697674,47 +83,k_means,43,1,2.4651162790697674,47 +84,hierarchical,44,1,2.409090909090909,47 +85,k_means,44,1,2.409090909090909,46 +86,hierarchical,45,1,2.3555555555555556,47 +87,k_means,45,1,2.3555555555555556,44 +88,hierarchical,46,1,2.3043478260869565,46 +89,k_means,46,1,2.3043478260869565,42 +90,hierarchical,47,1,2.25531914893617,25 +91,k_means,47,1,2.25531914893617,41 +92,hierarchical,48,1,2.2083333333333335,25 +93,k_means,48,1,2.2083333333333335,41 +94,hierarchical,49,1,2.163265306122449,25 +95,k_means,49,1,2.163265306122449,41 
+96,hierarchical,50,1,2.12,25 +97,k_means,50,1,2.12,40 +98,hierarchical,51,1,2.0784313725490198,25 +99,k_means,51,1,2.0784313725490198,39 +100,hierarchical,52,1,2.0384615384615383,25 +101,k_means,52,1,2.0384615384615383,38 +102,hierarchical,53,1,2.0,25 +103,k_means,53,1,2.0,38 +104,hierarchical,54,1,1.962962962962963,25 +105,k_means,54,1,1.962962962962963,37 +106,hierarchical,55,1,1.9272727272727272,25 +107,k_means,55,1,1.9272727272727272,35 +108,hierarchical,56,1,1.8928571428571428,25 +109,k_means,56,1,1.8928571428571428,34 +110,hierarchical,57,1,1.8596491228070176,25 +111,k_means,57,1,1.8596491228070176,33 +112,hierarchical,58,1,1.8275862068965518,25 +113,k_means,58,1,1.8275862068965518,32 +114,hierarchical,59,1,1.7966101694915255,25 +115,k_means,59,1,1.7966101694915255,31 +116,hierarchical,60,1,1.7666666666666666,25 +117,k_means,60,1,1.7666666666666666,30 +118,hierarchical,61,1,1.7377049180327868,25 +119,k_means,61,1,1.7377049180327868,29 +120,hierarchical,62,1,1.7096774193548387,25 +121,k_means,62,1,1.7096774193548387,28 +122,hierarchical,63,1,1.6825396825396826,25 +123,k_means,63,1,1.6825396825396826,27 +124,hierarchical,64,1,1.65625,25 +125,k_means,64,1,1.65625,27 diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.png b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.png new file mode 100644 index 0000000..9ad8853 Binary files /dev/null and b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.png differ diff --git a/clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv b/clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv index 71f2cc9..f303a3a 100644 --- a/clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv +++ b/clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv @@ -1,64 +1,64 @@ ,k_means,hierarchical -2,0.6855584100867681,0.6979818296524081 -3,0.6658312390685782,0.5363440260613704 +2,0.6979818296524081,0.6855584100867681 
+3,0.5363440260613704,0.6658312390685782 4,0.5447405755407478,0.5447405755407478 -5,0.49469855877597974,0.4950944104608897 -6,0.49629928069605667,0.3167619075077442 -7,0.4937183177275972,0.3273304877495634 -8,0.4903327662796836,0.16989336921679118 -9,0.33791118198002373,0.17626101482318196 -10,0.2667320598048964,0.19790344008120894 -11,0.2629948976926796,0.1943827895435377 -12,0.25965211932671445,0.20556562870341602 -13,0.26164323937367595,0.20144552653966163 -14,0.25806076142240403,0.22548403695669203 -15,0.26997893998401756,0.22918194758667895 -16,0.27256451459055664,0.2404290571765335 -17,0.2608837912623233,0.2345015455494567 -18,0.241790230179569,0.2390816398182416 -19,0.24484234464495422,0.24094820256010968 -20,0.2361050155539465,0.2435369787081999 -21,0.23692336175194548,0.262463283756636 -22,0.23946566771940794,0.2742864390420934 -23,0.24594283942153175,0.2979619533428987 -24,0.24734609860636583,0.29936015461670856 -25,0.2372755932074588,0.30224440986202594 -26,0.24082497341896647,0.30600924875137986 -27,0.24547723657004195,0.3147862783718484 -28,0.2503460498700128,0.31887407003386015 -29,0.26565769288673047,0.3204052924706567 -30,0.2951470761811464,0.3106572702067674 -31,0.30158824153259317,0.31330774028648145 -32,0.3180875184494547,0.33210454757827634 -33,0.32604023717225655,0.340503634089749 -34,0.3162922149083673,0.33568440892081625 -35,0.31716183472339093,0.3545992807283562 -36,0.3214298482703343,0.3575403386841057 -37,0.32681546873349715,0.36503026576341707 -38,0.32732304963529885,0.3738835074795801 -39,0.32990135488218114,0.3929681262996284 -40,0.32432743478528314,0.38848432563159185 -41,0.3198436341172465,0.39163178463382314 -42,0.32643375049241685,0.3860578645369252 -43,0.32203761977337186,0.4053389048253979 -44,0.3346466456087704,0.4217820126417848 -45,0.33223356867673165,0.4257244699851672 -46,0.33057959050289126,0.42988772845220063 -47,0.34588619420359423,0.4342738007458362 -48,0.3328354412937589,0.4416623097378058 
-49,0.33565227636252953,0.4426712673092602 -50,0.35908519690010676,0.4440749104193141 -51,0.3619904717574287,0.4456199850626709 -52,0.36212543071422687,0.45074758794403463 -53,0.3651696447072414,0.472336181881003 -54,0.3982691624564969,0.47730382490643575 -55,0.4013572084387477,0.454200815600248 -56,0.4246326548929088,0.4565666319605046 -57,0.44773608146478383,0.4522750157266781 -58,0.45157671443203573,0.4549088773042353 -59,0.4563204517812888,0.46145616231522185 -60,0.46300257860702615,0.4641242358826516 -61,0.4967563269695634,0.47006659021417746 -62,0.4994732063134373,0.47352022985333136 -63,0.49518159007961093,0.4775136217473302 -64,0.4872143751031807,0.4821848224907804 +5,0.4950944104608897,0.49469855877597974 +6,0.3167619075077442,0.49629928069605667 +7,0.3273304877495634,0.4937183177275972 +8,0.16989336921679118,0.4903327662796836 +9,0.17626101482318196,0.33791118198002373 +10,0.19790344008120894,0.2667320598048964 +11,0.1943827895435377,0.2629948976926796 +12,0.20556562870341602,0.25965211932671445 +13,0.20144552653966163,0.26164323937367595 +14,0.22548403695669203,0.25806076142240403 +15,0.22918194758667895,0.26997893998401756 +16,0.2404290571765335,0.27256451459055664 +17,0.2345015455494567,0.2608837912623233 +18,0.2390816398182416,0.241790230179569 +19,0.24094820256010968,0.24484234464495422 +20,0.2435369787081999,0.2361050155539465 +21,0.262463283756636,0.23692336175194548 +22,0.2742864390420934,0.23946566771940794 +23,0.2979619533428987,0.24594283942153175 +24,0.29936015461670856,0.24734609860636583 +25,0.30224440986202594,0.2372755932074588 +26,0.30600924875137986,0.24082497341896647 +27,0.3147862783718484,0.24547723657004195 +28,0.31887407003386015,0.2503460498700128 +29,0.3204052924706567,0.26565769288673047 +30,0.3106572702067674,0.2951470761811464 +31,0.31330774028648145,0.30158824153259317 +32,0.33210454757827634,0.3180875184494547 +33,0.340503634089749,0.32604023717225655 +34,0.33568440892081625,0.3162922149083673 
+35,0.3545992807283562,0.31716183472339093 +36,0.3575403386841057,0.3214298482703343 +37,0.36503026576341707,0.32681546873349715 +38,0.3738835074795801,0.32732304963529885 +39,0.3929681262996284,0.32990135488218114 +40,0.38848432563159185,0.32432743478528314 +41,0.39163178463382314,0.3198436341172465 +42,0.3860578645369252,0.32643375049241685 +43,0.4053389048253979,0.32203761977337186 +44,0.4217820126417848,0.3346466456087704 +45,0.4257244699851672,0.33223356867673165 +46,0.42988772845220063,0.33057959050289126 +47,0.4342738007458362,0.34588619420359423 +48,0.4416623097378058,0.3328354412937589 +49,0.4426712673092602,0.33565227636252953 +50,0.4440749104193141,0.35908519690010676 +51,0.4456199850626709,0.3619904717574287 +52,0.45074758794403463,0.36212543071422687 +53,0.472336181881003,0.3651696447072414 +54,0.47730382490643575,0.3982691624564969 +55,0.454200815600248,0.4013572084387477 +56,0.4565666319605046,0.4246326548929088 +57,0.4522750157266781,0.44773608146478383 +58,0.4549088773042353,0.45157671443203573 +59,0.46145616231522185,0.4563204517812888 +60,0.4641242358826516,0.46300257860702615 +61,0.47006659021417746,0.4967563269695634 +62,0.47352022985333136,0.4994732063134373 +63,0.4775136217473302,0.49518159007961093 +64,0.4821848224907804,0.4872143751031807 diff --git a/clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.csv b/clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.csv new file mode 100644 index 0000000..9c11029 --- /dev/null +++ b/clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.csv @@ -0,0 +1,127 @@ +,algorithm,k,min,mean,max +0,hierarchical,2,2,54.0,106 +1,k_means,2,1,54.0,107 +2,hierarchical,3,1,36.0,106 +3,k_means,3,1,36.0,103 +4,hierarchical,4,1,27.0,102 +5,k_means,4,1,27.0,102 +6,hierarchical,5,1,21.6,101 +7,k_means,5,1,21.6,102 +8,hierarchical,6,1,18.0,100 +9,k_means,6,1,18.0,93 +10,hierarchical,7,1,15.428571428571429,99 +11,k_means,7,1,15.428571428571429,91 +12,hierarchical,8,1,13.5,99 +13,k_means,8,1,13.5,71 
+14,hierarchical,9,1,12.0,90 +15,k_means,9,1,12.0,68 +16,hierarchical,10,1,10.8,80 +17,k_means,10,1,10.8,62 +18,hierarchical,11,1,9.818181818181818,80 +19,k_means,11,1,9.818181818181818,62 +20,hierarchical,12,1,9.0,80 +21,k_means,12,1,9.0,61 +22,hierarchical,13,1,8.307692307692308,79 +23,k_means,13,1,8.307692307692308,63 +24,hierarchical,14,1,7.714285714285714,77 +25,k_means,14,1,7.714285714285714,57 +26,hierarchical,15,1,7.2,77 +27,k_means,15,1,7.2,56 +28,hierarchical,16,1,6.75,76 +29,k_means,16,1,6.75,55 +30,hierarchical,17,1,6.352941176470588,76 +31,k_means,17,1,6.352941176470588,55 +32,hierarchical,18,1,6.0,65 +33,k_means,18,1,6.0,54 +34,hierarchical,19,1,5.684210526315789,65 +35,k_means,19,1,5.684210526315789,54 +36,hierarchical,20,1,5.4,65 +37,k_means,20,1,5.4,53 +38,hierarchical,21,1,5.142857142857143,65 +39,k_means,21,1,5.142857142857143,53 +40,hierarchical,22,1,4.909090909090909,64 +41,k_means,22,1,4.909090909090909,51 +42,hierarchical,23,1,4.695652173913044,64 +43,k_means,23,1,4.695652173913044,47 +44,hierarchical,24,1,4.5,64 +45,k_means,24,1,4.5,47 +46,hierarchical,25,1,4.32,34 +47,k_means,25,1,4.32,46 +48,hierarchical,26,1,4.153846153846154,34 +49,k_means,26,1,4.153846153846154,45 +50,hierarchical,27,1,4.0,34 +51,k_means,27,1,4.0,42 +52,hierarchical,28,1,3.857142857142857,34 +53,k_means,28,1,3.857142857142857,41 +54,hierarchical,29,1,3.7241379310344827,34 +55,k_means,29,1,3.7241379310344827,41 +56,hierarchical,30,1,3.6,34 +57,k_means,30,1,3.6,41 +58,hierarchical,31,1,3.4838709677419355,34 +59,k_means,31,1,3.4838709677419355,40 +60,hierarchical,32,1,3.375,34 +61,k_means,32,1,3.375,38 +62,hierarchical,33,1,3.272727272727273,34 +63,k_means,33,1,3.272727272727273,36 +64,hierarchical,34,1,3.176470588235294,34 +65,k_means,34,1,3.176470588235294,36 +66,hierarchical,35,1,3.085714285714286,34 +67,k_means,35,1,3.085714285714286,34 +68,hierarchical,36,1,3.0,34 +69,k_means,36,1,3.0,33 +70,hierarchical,37,1,2.918918918918919,34 +71,k_means,37,1,2.918918918918919,31 
+72,hierarchical,38,1,2.8421052631578947,34 +73,k_means,38,1,2.8421052631578947,31 +74,hierarchical,39,1,2.769230769230769,33 +75,k_means,39,1,2.769230769230769,29 +76,hierarchical,40,1,2.7,33 +77,k_means,40,1,2.7,29 +78,hierarchical,41,1,2.6341463414634148,33 +79,k_means,41,1,2.6341463414634148,28 +80,hierarchical,42,1,2.5714285714285716,33 +81,k_means,42,1,2.5714285714285716,28 +82,hierarchical,43,1,2.511627906976744,33 +83,k_means,43,1,2.511627906976744,26 +84,hierarchical,44,1,2.4545454545454546,33 +85,k_means,44,1,2.4545454545454546,26 +86,hierarchical,45,1,2.4,33 +87,k_means,45,1,2.4,25 +88,hierarchical,46,1,2.347826086956522,33 +89,k_means,46,1,2.347826086956522,24 +90,hierarchical,47,1,2.297872340425532,33 +91,k_means,47,1,2.297872340425532,23 +92,hierarchical,48,1,2.25,21 +93,k_means,48,1,2.25,23 +94,hierarchical,49,1,2.204081632653061,20 +95,k_means,49,1,2.204081632653061,23 +96,hierarchical,50,1,2.16,18 +97,k_means,50,1,2.16,22 +98,hierarchical,51,1,2.1176470588235294,17 +99,k_means,51,1,2.1176470588235294,21 +100,hierarchical,52,1,2.076923076923077,16 +101,k_means,52,1,2.076923076923077,20 +102,hierarchical,53,1,2.0377358490566038,16 +103,k_means,53,1,2.0377358490566038,18 +104,hierarchical,54,1,2.0,16 +105,k_means,54,1,2.0,18 +106,hierarchical,55,1,1.9636363636363636,16 +107,k_means,55,1,1.9636363636363636,19 +108,hierarchical,56,1,1.9285714285714286,16 +109,k_means,56,1,1.9285714285714286,18 +110,hierarchical,57,1,1.894736842105263,16 +111,k_means,57,1,1.894736842105263,18 +112,hierarchical,58,1,1.8620689655172413,16 +113,k_means,58,1,1.8620689655172413,18 +114,hierarchical,59,1,1.8305084745762712,16 +115,k_means,59,1,1.8305084745762712,18 +116,hierarchical,60,1,1.8,16 +117,k_means,60,1,1.8,17 +118,hierarchical,61,1,1.7704918032786885,16 +119,k_means,61,1,1.7704918032786885,17 +120,hierarchical,62,1,1.7419354838709677,16 +121,k_means,62,1,1.7419354838709677,17 +122,hierarchical,63,1,1.7142857142857142,16 +123,k_means,63,1,1.7142857142857142,17 
+124,hierarchical,64,1,1.6875,16 +125,k_means,64,1,1.6875,17 diff --git a/clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.png b/clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.png new file mode 100644 index 0000000..0a1275c Binary files /dev/null and b/clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.png differ diff --git a/prec_recall.py b/prec_recall.py index f81c325..f16a2bb 100755 --- a/prec_recall.py +++ b/prec_recall.py @@ -25,10 +25,16 @@ def intrapairs(path: str) -> set[set[str, str]]: def main(): filelist = glob.glob(IN_DIR + '/*_groundtruth.csv') + df_table = pd.DataFrame(columns=pd.MultiIndex.from_tuples([ + ('KMeans', 'Precision'), + ('KMeans', 'Recall'), + ('Agglomerative', 'Precision'), + ('Agglomerative', 'Recall')])) + df_table.index.name = 'Class Name' + for f in filelist: clazz_name = os.path.basename(f) clazz_name = clazz_name[:clazz_name.rfind('_groundtruth.csv')] - print(clazz_name) ground_pairs = intrapairs(f) for method in ['kmeans', 'hierarchical']: @@ -39,10 +45,15 @@ def main(): precision = n_common / len(cluster_pairs) recall = n_common / len(ground_pairs) - print(method + " precision: " + str(precision)) - print(method + " recall: " + str(recall)) + algo = 'KMeans' if method == 'kmeans' else 'Agglomerative' + + df_table.loc[clazz_name, [(algo, 'Precision'), (algo, 'Recall')]] = [ + str(round(precision * 100, 2)) + '%', + str(round(recall * 100, 2)) + '%' + ] - print() + df_table.columns = [x[0] + ' ' + x[1] for x in df_table.columns] + print(df_table.to_markdown()) if __name__ == '__main__': diff --git a/report/build.sh b/report/build.sh new file mode 100755 index 0000000..237654a --- /dev/null +++ b/report/build.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +SCRIPT_DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +cd "$SCRIPT_DIR" +pandoc main.md -o main.pdf \ No newline at end of file diff --git a/report/main.md b/report/main.md index 8afe3a5..7745c22 100644 --- a/report/main.md 
+++ b/report/main.md @@ -1,7 +1,7 @@ --- author: Claudio Maggioni title: Information Modelling & Analysis -- Project 1 -geometry: margin=2.5cm,bottom=3cm +geometry: margin=2cm,bottom=3cm --- +You may - at your own risk - also choose not to use this template. As long as +your submission is a latex-generated, English PDF containing all expected info, +you'll be fine. --> # Code Repository The code and result files part of this submission can be found at: -::: center -Repository: \url{https://github.com/infoMA2023/project-01-god-classes-maggicl} +::: center Repository: +\url{https://github.com/infoMA2023/project-01-god-classes-maggicl} -Commit ID: **TBD** -::: +Commit ID: **TBD** ::: # Data Pre-Processing ## God Classes -The first part of the project requires to label some classes of the _Xerces_ -project as "God classes" based on the number of methods each class has. +The first part of the project requires labelling some classes of the _Xerces_ +project as "God classes" based on the number of methods each class has. From +here onwards the Java package prefix `org.apache.xerces` is omitted when discussing +fully qualified domain names of classes for the sake of brevity. + Specifically, I label "God classes" the classes that have a number of methods -six times the standard deviation above the the mean number of methods, i.e. where -the condition +six times the standard deviation above the mean number of methods, i.e. +where the condition $$|M(C)| > \mu(M) + 6\sigma(M)$$ holds. -To scan and compute the number of methods of each class I use the Python library `javalang`, which implements the Java AST and parser. The Python script -`./find_god_classes.py` uses this library to parse each file in the project and -compute the number of methods of each class. Note that only non-constructor methods are counted (specifically the code counts the number of `method` nodes in each `ClassDeclaration` node).
+To scan and compute the number of methods of each class I use the Python library +`javalang`, which implements the Java AST and parser. The Python script +`./find_god_classes.py` uses this library to parse each file in the project and +compute the number of methods of each class. Note that only non-constructor +methods are counted (specifically the code counts the number of `method` nodes +in each `ClassDeclaration` node). -Then, the script computes mean and standard -deviation of the number of methods and filters the list of classes according to the -condition described above. The file `god_classes/god_classes.csv` then is outputted -listing all the god classes found. +Then, the script computes mean and standard deviation of the number of methods +and filters the list of classes according to the condition described above. The +file `god_classes/god_classes.csv` then is outputted listing all the god classes +found. -The god classes I identified, and their corresponding number of methods -can be found in Table [1](#tab:god_classes){reference-type="ref" +The god classes I identified, and their corresponding number of methods can be +found in Table [1](#tab:god_classes){reference-type="ref" reference="tab:god_classes"}. ::: {#tab:god_classes} | **Class Name** | **# Methods** | |:------------------------------------------------|------------:| -| org.apache.xerces.impl.xs.traversers.XSDHandler | 118 | -| org.apache.xerces.impl.dtd.DTDGrammar | 101 | -| org.apache.xerces.xinclude.XIncludeHandler | 116 | -| org.apache.xerces.dom.CoreDocumentImpl | 125 | +| impl.xs.traversers.XSDHandler | 118 | +| impl.dtd.DTDGrammar | 101 | +| xinclude.XIncludeHandler | 116 | +| dom.CoreDocumentImpl | 125 | : Identified God Classes ::: @@ -70,84 +73,198 @@ reference="tab:god_classes"}. ## Feature Vectors In this part of the project we produce the feature vectors used to later cluster -the methods of each God class into separate clusters. 
We produce one feature method per -non-constructor Java method in each god class. +the methods of each God class into separate clusters. We produce one feature +vector per non-constructor Java method in each god class. -The columns of each vector represent -fields and methods referenced by each method, i.e. fields and methods actively used by the method in their method's body. +The columns of each vector represent fields and methods referenced by each +method, i.e. fields and methods actively used by the method in its +body. -When analyzing references to fields, additional constraints need to be specified to handle edge cases. -Namely, a field's property may be referenced (e.g. an access to array `a` may fetch its `length` property, i.e. `a.length`). In this -cases I consider the qualifier (i.e. the field itself, `a`) itself and not its property. When the qualifier is a class (i.e. -the code references a property of another class, e.g. `Integer.MAX_VALUE`) we consider the class name itself (i.e. `Integer`) and not -the name of the property. Should the qualifier be a subproperty itself (e.g. in `a.b.c`, where `a.b` would be the qualifier according to `javalang`) +When analyzing references to fields, additional constraints need to be specified +to handle edge cases. Namely, a field's property may be referenced (e.g. an +access to array `a` may fetch its `length` property, i.e. `a.length`). In these +cases I consider the qualifier (i.e. the field itself, `a`) and not its +property. When the qualifier is a class (i.e. the code references a property of +another class, e.g. `Integer.MAX_VALUE`) we consider the class name itself (i.e. +`Integer`) and not the name of the property. Should the qualifier be a +subproperty itself (e.g. in `a.b.c`, where `a.b` would be the qualifier +according to `javalang`) -For methods, I only consider calls to methods of the class itself where the qualifier is unspecified or `this`. Calls to parent methods -(i.e.
calls like `super.something()`) are not considered. +For methods, I only consider calls to methods of the class itself where the +qualifier is unspecified or `this`. Calls to parent methods (i.e. calls like +`super.something()`) are not considered. -The feature vector extraction phase is performed by the Python script `extract_feature_vectors.py`. The script takes `god_classes/god_classes.csv` as input -and loads the AST of each class listed in it. Then, a list of all the fields and methods in the class is built, and each method is scanned to see which fields -and methods it references in its body according to the previously described rules. Then, a CSV per class is built storing all feature vectors. Each file has a name matching to the FQDN (Fully-qualified domain name) of the class. Each CSV row refers to a method in the class, and each CSV column refers to a field, method or referenced class. A cell has the value of 1 when the method of that row references the field, method or class marked by that column, and it has the value 0 otherwise. Columns with only zeros are omitted. +The feature vector extraction phase is performed by the Python script +`extract_feature_vectors.py`. The script takes `god_classes/god_classes.csv` as +input and loads the AST of each class listed in it. Then, a list of all the +fields and methods in the class is built, and each method is scanned to see +which fields and methods it references in its body according to the previously +described rules. Then, a CSV per class is built storing all feature vectors. +Each file has a name matching to the FQDN (Fully-qualified domain name) of the +class. Each CSV row refers to a method in the class, and each CSV column refers +to a field, method or referenced class. A cell has the value of 1 when the +method of that row references the field, method or class marked by that column, +and it has the value 0 otherwise. Columns with only zeros are omitted. 
-Table [2](#tab:feat_vec){reference-type="ref" reference="tab:feat_vec"} -shows aggregate numbers regarding the extracted feature vectors for the -god classes. Note that the number of attributes refers to the number of fields, methods or classes actually references (i.e. the number of columns after omission of 0s). +Table [2](#tab:feat_vec){reference-type="ref" reference="tab:feat_vec"} shows +aggregate numbers regarding the extracted feature vectors for the god classes. +Note that the number of attributes refers to the number of fields, methods or +classes actually referenced (i.e. the number of columns after omission of 0s). ::: {#tab:feat_vec} | **Class Name** | **# Feature Vectors** | **# Attributes\*** | |:------------------------------------------------|----------------------:|-----------------:| -| org.apache.xerces.impl.xs.traversers.XSDHandler | 106 | 183 | -| org.apache.xerces.impl.dtd.DTDGrammar | 91 | 106 | -| org.apache.xerces.xinclude.XIncludeHandler | 108 | 143 | -| org.apache.xerces.dom.CoreDocumentImpl | 117 | 63 | +| impl.xs.traversers.XSDHandler | 106 | 183 | +| impl.dtd.DTDGrammar | 91 | 106 | +| xinclude.XIncludeHandler | 108 | 143 | +| dom.CoreDocumentImpl | 117 | 63 | : Feature vector summary (\*= used at least once) ::: # Clustering {#sec:clustering} +In this section I cover the techniques to cluster the methods of each god +class. The project aims to use KMeans clustering and agglomerative hierarchical +clustering to group these methods together in cohesive units which could be +potentially refactored out of the god class they belong to. + ## Algorithm Configurations -Report/comment the algorithm configurations (distance function, linkage -rule, etc.). You may do so in any form you feel suited, but a short -paragraph of text is probably sufficient. +To perform KMeans clustering, I use the `cluster.KMeans` Scikit-Learn +implementation of the algorithm.
I use the default parameters: feature vectors +are compared with Euclidean distance, centroids are used instead of medoids, +and the initial centroids are computed with the greedy algorithm `k-means++`. The +random seed is fixed to $0$ to allow for reproducibility between executions of +the clustering script. + +To perform Hierarchical clustering, I use the `cluster.AgglomerativeClustering` +Scikit-Learn implementation of the algorithm. Again feature vectors are +compared with Euclidean distance, but as a linkage metric I choose to use +complete linkage. As agglomerative clustering is deterministic, no random seed +is needed for this algorithm. + +I run the two algorithms for all $k \in [2,65]$, or, if fewer than 65 feature +vectors with distinct values are assigned to the god class, the upper bound of +$k$ is that number. ## Testing Various K & Silhouette Scores -\(1\) Report data about the clusters produced by the two algorithms at -various k (#clusters, size of clusters, silhouette scores). You may use -any suitable format (table, graph, \...). +To find the optimal value of $k$ for both algorithms, to compute the distribution of +cluster sizes and silhouette scores across values of $k$, and to apply the optimal +clustering for each god class, I run the command: -\(2\) Briefly comment your results. What is the best configuration, and -why? Anything else you observed? +```shell +./silhouette.py --validate --autorun +``` + +Feature vectors are read from the `feature_vectors` directory and all the +results are stored in the `clustering` directory. + +Figures [1](#fig:xsd){reference-type="ref" reference="fig:xsd"}, +[2](#fig:dtd){reference-type="ref" reference="fig:dtd"}, +[3](#fig:xinc){reference-type="ref" reference="fig:xinc"}, and +[4](#fig:cimpl){reference-type="ref" reference="fig:cimpl"} show the +distributions of cluster sizes for each god class obtained by running the KMeans +and agglomerative clustering algorithms as described in the previous sections.
+ +For all god classes, the mean number of elements in each cluster +decreases in inverse proportion to $k$ as $k$ increases. Aside from the first values of $k$ for +class `DTDGrammar` (where it was 2), the minimum cluster size was 1 for all +analyzed clusterings. Conversely, the maximum cluster size varies a lot, almost +always being monotonically non-increasing as $k$ increases, occasionally forming +wide plateaus. The silhouette metric distribution instead generally follows a +dogleg-like path, sharply decreasing for the first values of $k$ and slowly +increasing afterwards. As a result, the optimal number of +clusters $k$ for each algorithm is found either at very low or at very high values. + +The figures also show the distribution of the silhouette metric per algorithm +and per value of $k$. The optimal values of $k$ and the respective silhouette +values for each implementation are reported in Table +[3](#tab:sumup){reference-type="ref" reference="tab:sumup"}. + +From the values we can gather that agglomerative clustering performs overall +better than KMeans for the god classes in the project. Almost all god classes are +optimally clustered with few clusters, with the exception of `CoreDocumentImpl` +being optimally clustered with unit clusters. This could indicate higher +cohesion between implementation details of the other classes, and lower cohesion +in `CoreDocumentImpl` (given the name it would not be surprising if this class +plays the role of a utility class of sorts, combining lots of implementation +details affecting different areas of the code). + +Agglomerative clustering with complete linkage could perform better than KMeans +due to a more urgent need for separation rather than cohesion in the classes +that were analyzed.
Given the high dimensionality of the feature vectors used, +and the fact that Euclidean distance is used to compare feature vectors, the +hyper-space of method features for each god class is likely sparse, with +occasional clusters of tightly-knit features. Given the prevailing sparsity, +complete linkage could be suitable here since, above all, it avoids agglomerating distant +clusters. + +![Clustering metrics for class impl.xs.traversers.XSDHandler](../clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_stats.png){#fig:xsd} + +![Clustering metrics for class impl.dtd.DTDGrammar](../clustering/org.apache.xerces.impl.dtd.DTDGrammar_stats.png){#fig:dtd} + +![Clustering metrics for class xinclude.XIncludeHandler](../clustering/org.apache.xerces.xinclude.XIncludeHandler_stats.png){#fig:xinc} + +![Clustering metrics for class dom.CoreDocumentImpl](../clustering/org.apache.xerces.dom.CoreDocumentImpl_stats.png){#fig:cimpl} + +::: {#tab:sumup} +| **Class Name** | **KMeans K** | **KMeans silhouette** | **Hierarchical K** | **Hierarchical silhouette** | +|:----------------------------|-----------:|--------------------:|-----------------:|--------------------------:| +| dom.CoreDocumentImpl | 45 |0.7290 | 45 | 0.7290 | +| impl.xs.traversers.XSDHandler | 2 |0.5986 | 3 | 0.5989 | +| impl.dtd.DTDGrammar | 58 |0.3980 | 2 | 0.4355 | +| xinclude.XIncludeHandler | 2 |0.6980 | 2 | 0.6856 | + + : Optimal hyperparameters and corresponding silhouette metrics for KMeans and +Hierarchical clustering algorithms. +::: # Evaluation ## Ground Truth -I computed the ground truth using the command \.... The generated files -are checked into the repository with the names \.... +I computed the ground truth using the Python script `./ground_truth.py`. The +generated files are checked into the repository with the names +`clustering/{className}_groundtruth.csv` where `{className}` is the FQDN of each +god class. -Comment briefly on the strengths & weaknesses of our ground truth.
+The ground truth in this project is not given but generated according to simple +heuristics. Since no inherent structure or labelling from experts exists to +group the methods in each god class, the project requires labelling methods based +on keyword matching within each method name. The list of keywords used can be +found in `keyword_list.txt`. This approach makes it possible to obtain a ground truth at all, +with little computational cost and labelling effort, but it assumes the method +name and the chosen keywords are indeed significant enough to form a +meaningful clustering of methods that form refactorable cohesive units of +functionality. ## Precision and Recall ::: {#tab:eval} - ---------------- ------------------- -------- ------------- -------- - **Class Name** **Agglomerative** **K-Means** - Prec. Recall Prec. Recall - \... \... \... \... \... - ---------------- ------------------- -------- ------------- -------- +| **Class Name** | **KMeans Precision** | **KMeans Recall** | **Agglomerative Precision** | **Agglomerative Recall** | +|:------------------------------------------------|-------------------:|----------------:|--------------------------:|-----------------------:| +| xinclude.XIncludeHandler | 69.83% | 97.80% | 69.58% | 95.65% | +| dom.CoreDocumentImpl | 64.80% | 28.26% | 68.11% | 29.70% | +| impl.xs.traversers.XSDHandler | 36.17% | 97.24% | 36.45% | 96.11% | +| impl.dtd.DTDGrammar | 87.65% | 6.87% | 52.21% | 94.28% | : Evaluation Summary ::: Precision and Recall, for the optimal configurations found in Section -[3](#sec:clustering){reference-type="ref" reference="sec:clustering"}, -are reported in Table [3](#tab:eval){reference-type="ref" -reference="tab:eval"}. +[3](#sec:clustering){reference-type="ref" reference="sec:clustering"}, are +reported in Table [4](#tab:eval){reference-type="ref" reference="tab:eval"}. 
+ +\begin{center} +\color{red} comment precision and recall values +\end{center} ## Practical Usefulness -Discuss the practical usefulness of the obtained code refactoring -assistant in a realistic setting (1 paragraph). +\begin{center} +\color{red}Discuss the practical usefulness of the obtained code refactoring assistant in a +realistic setting (1 paragraph). +\end{center} + diff --git a/report/main.pdf b/report/main.pdf index 85bedc2..a146550 100644 Binary files a/report/main.pdf and b/report/main.pdf differ diff --git a/silhouette.py b/silhouette.py index b89d7ae..3e87916 100755 --- a/silhouette.py +++ b/silhouette.py @@ -8,6 +8,9 @@ import pandas as pd import argparse from k_means import cluster_kmeans from hierarchical import cluster_hierarchical +from collections import Counter +import seaborn as sns +import matplotlib.pyplot as plt DIR: str = os.path.dirname(os.path.realpath(__file__)) OUT_DIR: str = DIR + '/clustering' @@ -20,47 +23,91 @@ def clean_output(): filelist = glob.glob(OUT_DIR + '/*_silhouette.csv') for f in filelist: os.remove(f) + filelist = glob.glob(OUT_DIR + '/*.png') + for f in filelist: + os.remove(f) -def validate(path: str, clazz_name: str, autorun: bool): +def validate(path: str, clazz_name: str, autorun: bool, df_table): df = pd.DataFrame(columns=['k_means', 'hierarchical'], dtype=float) + df_stats = pd.DataFrame(columns=['algorithm', 'k', 'min', 'mean', 'max']) + + def add_stat(algo: str, k: int, Y: any, i: int): + y_occurs = list(Counter(Y).values()) # count number of elements in each cluster + df_stats.loc[i, :] = [algo, k, np.min(y_occurs), np.mean(y_occurs), np.max(y_occurs)] # We bound the number of clusters by the number of distinct points in our dataset. # To count them, we compute the number of "distinct" feature vectors and we # bound to the minimum of K_MAX and this number. 
nodup = pd.read_csv(path, index_col=0).drop_duplicates() max_distinct = len(nodup) - print("Max distinct:", max_distinct) + limit = min(K_MAX, max_distinct) - for n in range(2, min(K_MAX, max_distinct)): + i: int = 0 + for n in range(2, limit): X_h, Y_h = cluster_hierarchical(path, n, save_to_disk=False) - df.loc[n, 'k_means'] = silhouette_score(X_h, Y_h) + add_stat('hierarchical', n, Y_h, i) + i += 1 + df.loc[n, 'hierarchical'] = silhouette_score(X_h, Y_h) X_k, Y_k = cluster_kmeans(path, n, save_to_disk=False) - df.loc[n, 'hierarchical'] = silhouette_score(X_k, Y_k) + add_stat('k_means', n, Y_k, i) + i += 1 + df.loc[n, 'k_means'] = silhouette_score(X_k, Y_k) k_kmeans = df[['k_means']].idxmax()[0] k_hierarchical = df[['hierarchical']].idxmax()[0] - print("K_means optimal value: " + str(k_kmeans)) - print("Hierarchical optimal value: " + str(k_hierarchical)) + df_table.loc[clazz_name] = [k_kmeans, 0, k_hierarchical, 0] df.to_csv(OUT_DIR + '/' + clazz_name + '_silhouette.csv') + df_stats.to_csv(OUT_DIR + '/' + clazz_name + '_stats.csv') if autorun: cluster_hierarchical(path, k_hierarchical) cluster_kmeans(path, k_kmeans) + # Plot stats + sns.set_theme(palette="hls") + + # Initialize the matplotlib figure + f = plt.figure(figsize=(14, 12)) + gs = f.add_gridspec(2, 2) + ax1 = f.add_subplot(gs[0, 0]) + ax2 = f.add_subplot(gs[0, 1]) + ax3 = f.add_subplot(gs[1, :]) + + df_k = df_stats.loc[df_stats.algorithm == 'k_means', ['k', 'min', 'mean', 'max']].set_index('k', drop=True) + df_h = df_stats.loc[df_stats.algorithm == 'hierarchical', ['k', 'min', 'mean', 'max']].set_index('k', drop=True) + + sns.lineplot(data=df_k, palette="tab10", ax=ax1) + sns.lineplot(data=df_h, palette="tab10", ax=ax2) + sns.lineplot(data=df, palette="tab10", ax=ax3) + + # Add a legend and informative axis label + ax1.set(ylabel="# of elements", ylim=[0, 130], xlabel="# of clusters", xlim=[2, limit]) + ax1.set_title("K-Means cluster sizes") + ax2.set(ylabel="# of elements", ylim=[0, 130], xlabel="# of 
clusters", xlim=[2, limit]) + ax2.set_title("Hierarchical cluster sizes") + ax3.set(ylabel="Silhouette", ylim=[0, 1], xlabel="# of clusters", xlim=[2, limit]) + ax3.set_title("Silhouette metrics per # of clusters") + + sns.despine(left=True, bottom=True) + f.savefig(OUT_DIR + '/' + clazz_name + '_stats.png') + plt.clf() -def compute_silhouette(path: str, clazz_name: str, suffix: str): +def compute_silhouette(path: str, clazz_name: str, suffix: str) -> float: df_y = pd.read_csv(OUT_DIR + '/' + clazz_name + '_' + suffix + '.csv') Y = df_y.iloc[:, 1].values df = pd.read_csv(path) X = df.drop(df.columns[0], axis=1).to_numpy() - print("Silhouette for " + suffix + ": " + str(silhouette_score(X, Y))) + s = round(silhouette_score(X, Y), 4) + + print("Silhouette for " + suffix + ": " + str(s)) + return s def main(): @@ -70,26 +117,30 @@ def main(): parser.add_argument('--autorun', action='store_true', help='if validating, computes CSV for optimal clustering automatically') - args = parser.parse_args() if args.validate: clean_output() + df_table = pd.DataFrame(columns=['KMeans K', 'KMeans silhouette', 'Hierarchical K', 'Hierarchical silhouette']) + filelist = glob.glob(IN_DIR + '/*.csv') for f in filelist: clazz_name = os.path.basename(f) clazz_name = clazz_name[:clazz_name.rfind('.')] - print(clazz_name) + if args.validate: + validate(f, clazz_name, args.autorun, df_table) + + sk = compute_silhouette(f, clazz_name, 'kmeans') + sh = compute_silhouette(f, clazz_name, 'hierarchical') if args.validate: - validate(f, clazz_name, args.autorun) + df_table.loc[clazz_name, 'KMeans silhouette'] = sk + df_table.loc[clazz_name, 'Hierarchical silhouette'] = sh - compute_silhouette(f, clazz_name, 'kmeans') - compute_silhouette(f, clazz_name, 'hierarchical') - - print() + df_table.index.name = 'Class Name' + print(df_table.to_markdown()) if __name__ == '__main__':