From 703b77a39ddbdd9f87a1b7024dc2f08b2eca7b82 Mon Sep 17 00:00:00 2001 From: Claudio Maggioni Date: Wed, 22 Mar 2023 14:28:17 +0100 Subject: [PATCH] done part 3 and part 4 --- ...erces.dom.CoreDocumentImpl_groundtruth.csv | 118 ++++++++++++++++++ ...rces.dom.CoreDocumentImpl_hierarchical.csv | 118 ++++++++++++++++++ ...che.xerces.dom.CoreDocumentImpl_kmeans.csv | 118 ++++++++++++++++++ ...xerces.dom.CoreDocumentImpl_silhouette.csv | 9 ++ ...xerces.impl.dtd.DTDGrammar_groundtruth.csv | 92 ++++++++++++++ ...erces.impl.dtd.DTDGrammar_hierarchical.csv | 92 ++++++++++++++ ...ache.xerces.impl.dtd.DTDGrammar_kmeans.csv | 92 ++++++++++++++ ....xerces.impl.dtd.DTDGrammar_silhouette.csv | 16 +++ ...l.xs.traversers.XSDHandler_groundtruth.csv | 107 ++++++++++++++++ ....xs.traversers.XSDHandler_hierarchical.csv | 107 ++++++++++++++++ ...s.impl.xs.traversers.XSDHandler_kmeans.csv | 107 ++++++++++++++++ ...pl.xs.traversers.XSDHandler_silhouette.csv | 14 +++ ...s.xinclude.XIncludeHandler_groundtruth.csv | 109 ++++++++++++++++ ....xinclude.XIncludeHandler_hierarchical.csv | 109 ++++++++++++++++ ...xerces.xinclude.XIncludeHandler_kmeans.csv | 109 ++++++++++++++++ ...es.xinclude.XIncludeHandler_silhouette.csv | 15 +++ ground_truth.py | 47 +++++++ hierarchical.py | 49 ++++++++ k_means.py | 52 ++++++++ keyword_list.txt | 14 +++ prec_recall.py | 49 ++++++++ readme.md | 28 ++++- requirements.txt | 4 +- silhouette.py | 95 ++++++++++++++ 24 files changed, 1667 insertions(+), 3 deletions(-) create mode 100644 clustering/org.apache.xerces.dom.CoreDocumentImpl_groundtruth.csv create mode 100644 clustering/org.apache.xerces.dom.CoreDocumentImpl_hierarchical.csv create mode 100644 clustering/org.apache.xerces.dom.CoreDocumentImpl_kmeans.csv create mode 100644 clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv create mode 100644 clustering/org.apache.xerces.impl.dtd.DTDGrammar_groundtruth.csv create mode 100644 clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv 
create mode 100644 clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv create mode 100644 clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv create mode 100644 clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_groundtruth.csv create mode 100644 clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv create mode 100644 clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv create mode 100644 clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv create mode 100644 clustering/org.apache.xerces.xinclude.XIncludeHandler_groundtruth.csv create mode 100644 clustering/org.apache.xerces.xinclude.XIncludeHandler_hierarchical.csv create mode 100644 clustering/org.apache.xerces.xinclude.XIncludeHandler_kmeans.csv create mode 100644 clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv create mode 100755 ground_truth.py create mode 100755 hierarchical.py create mode 100755 k_means.py create mode 100644 keyword_list.txt create mode 100755 prec_recall.py create mode 100755 silhouette.py diff --git a/clustering/org.apache.xerces.dom.CoreDocumentImpl_groundtruth.csv b/clustering/org.apache.xerces.dom.CoreDocumentImpl_groundtruth.csv new file mode 100644 index 0000000..9415fde --- /dev/null +++ b/clustering/org.apache.xerces.dom.CoreDocumentImpl_groundtruth.csv @@ -0,0 +1,118 @@ +,cluster +getOwnerDocument,0 +getNodeType,0 +getNodeName,0 +cloneNode,0 +insertBefore,0 +removeChild,0 +replaceChild,0 +getTextContent,10 +setTextContent,10 +getFeature,0 +createAttribute,1 +createCDATASection,1 +createComment,1 +createDocumentFragment,1 +createElement,1 +createEntityReference,1 +createProcessingInstruction,1 +createTextNode,1 +getDoctype,0 +getDocumentElement,0 +getElementsByTagName,0 +getImplementation,0 +setErrorChecking,9 +setStrictErrorChecking,9 +getErrorChecking,9 +getStrictErrorChecking,9 +getInputEncoding,6 +setInputEncoding,6 +setXmlEncoding,6 +setEncoding,6 +getXmlEncoding,6 
+getEncoding,6 +setXmlVersion,0 +setVersion,0 +getXmlVersion,0 +getVersion,0 +setXmlStandalone,5 +setStandalone,5 +getXmlStandalone,5 +getStandalone,5 +getDocumentURI,4 +canRenameElements,0 +renameNode,0 +replaceRenameElement,0 +normalizeDocument,0 +getDomConfig,0 +getBaseURI,4 +setDocumentURI,4 +getAsync,0 +setAsync,0 +abort,0 +load,0 +loadXML,0 +saveXML,0 +setMutationEvents,0 +getMutationEvents,0 +createDocumentType,1 +createEntity,1 +createNotation,1 +createElementDefinition,1 +getNodeNumber,0 +importNode,0 +adoptNode,0 +undeferChildren,0 +getElementById,0 +clearIdentifiers,7 +putIdentifier,7 +getIdentifier,7 +removeIdentifier,7 +getIdentifiers,7 +createElementNS,1 +createAttributeNS,1 +getElementsByTagNameNS,0 +clone,0 +isXMLName,0 +isValidQName,0 +isKidOK,0 +changed,0 +changes,0 +getNodeListCache,3 +freeNodeListCache,3 +setUserData,8 +getUserData,8 +getUserDataRecord,8 +removeUserDataTable,8 +setUserDataTable,8 +callUserDataHandlers,8 +checkNamespaceWF,0 +checkDOMNSErr,0 +checkQName,0 +isXML11Version,0 +isNormalizeDocRequired,0 +isXMLVersionChanged,0 +addEventListener,0 +removeEventListener,0 +copyEventListeners,0 +dispatchEvent,0 +replacedText,0 +deletedText,0 +insertedText,0 +modifyingCharacterData,0 +modifiedCharacterData,0 +insertingNode,0 +insertedNode,0 +removingNode,0 +removedNode,0 +replacingNode,0 +replacedNode,0 +replacingData,0 +replacedCharacterData,0 +modifiedAttrValue,0 +setAttrNode,0 +removedAttrNode,0 +renamedAttrNode,0 +renamedElement,0 +readObject,2 +writeObject,2 diff --git a/clustering/org.apache.xerces.dom.CoreDocumentImpl_hierarchical.csv b/clustering/org.apache.xerces.dom.CoreDocumentImpl_hierarchical.csv new file mode 100644 index 0000000..ba33db8 --- /dev/null +++ b/clustering/org.apache.xerces.dom.CoreDocumentImpl_hierarchical.csv @@ -0,0 +1,118 @@ +,cluster +getOwnerDocument,0 +getNodeType,3 +getNodeName,0 +cloneNode,0 +insertBefore,7 +removeChild,3 +replaceChild,7 +getTextContent,0 +setTextContent,0 +getFeature,0 +createAttribute,2 
+createCDATASection,0 +createComment,0 +createDocumentFragment,0 +createElement,2 +createEntityReference,2 +createProcessingInstruction,2 +createTextNode,0 +getDoctype,0 +getDocumentElement,0 +getElementsByTagName,0 +getImplementation,0 +setErrorChecking,0 +setStrictErrorChecking,0 +getErrorChecking,0 +getStrictErrorChecking,0 +getInputEncoding,0 +setInputEncoding,0 +setXmlEncoding,0 +setEncoding,0 +getXmlEncoding,0 +getEncoding,0 +setXmlVersion,2 +setVersion,0 +getXmlVersion,0 +getVersion,0 +setXmlStandalone,0 +setStandalone,0 +getXmlStandalone,0 +getStandalone,0 +getDocumentURI,0 +canRenameElements,0 +renameNode,1 +replaceRenameElement,6 +normalizeDocument,0 +getDomConfig,0 +getBaseURI,0 +setDocumentURI,0 +getAsync,0 +setAsync,2 +abort,0 +load,0 +loadXML,0 +saveXML,2 +setMutationEvents,0 +getMutationEvents,0 +createDocumentType,0 +createEntity,2 +createNotation,2 +createElementDefinition,2 +getNodeNumber,0 +importNode,8 +adoptNode,1 +undeferChildren,0 +getElementById,0 +clearIdentifiers,0 +putIdentifier,0 +getIdentifier,0 +removeIdentifier,0 +getIdentifiers,0 +createElementNS,0 +createAttributeNS,0 +getElementsByTagNameNS,0 +clone,0 +isXMLName,0 +isValidQName,0 +isKidOK,3 +changed,0 +changes,0 +getNodeListCache,4 +freeNodeListCache,0 +setUserData,0 +getUserData,0 +getUserDataRecord,0 +removeUserDataTable,0 +setUserDataTable,0 +callUserDataHandlers,0 +checkNamespaceWF,2 +checkDOMNSErr,5 +checkQName,2 +isXML11Version,0 +isNormalizeDocRequired,0 +isXMLVersionChanged,0 +addEventListener,0 +removeEventListener,0 +copyEventListeners,0 +dispatchEvent,0 +replacedText,0 +deletedText,0 +insertedText,0 +modifyingCharacterData,0 +modifiedCharacterData,0 +insertingNode,0 +insertedNode,0 +removingNode,0 +removedNode,0 +replacingNode,0 +replacedNode,0 +replacingData,0 +replacedCharacterData,0 +modifiedAttrValue,0 +setAttrNode,0 +removedAttrNode,0 +renamedAttrNode,0 +renamedElement,0 +readObject,0 +writeObject,0 diff --git 
a/clustering/org.apache.xerces.dom.CoreDocumentImpl_kmeans.csv b/clustering/org.apache.xerces.dom.CoreDocumentImpl_kmeans.csv new file mode 100644 index 0000000..55b1793 --- /dev/null +++ b/clustering/org.apache.xerces.dom.CoreDocumentImpl_kmeans.csv @@ -0,0 +1,118 @@ +,cluster +getOwnerDocument,0 +getNodeType,3 +getNodeName,0 +cloneNode,8 +insertBefore,2 +removeChild,3 +replaceChild,2 +getTextContent,0 +setTextContent,0 +getFeature,0 +createAttribute,1 +createCDATASection,0 +createComment,0 +createDocumentFragment,0 +createElement,1 +createEntityReference,1 +createProcessingInstruction,1 +createTextNode,0 +getDoctype,0 +getDocumentElement,0 +getElementsByTagName,0 +getImplementation,0 +setErrorChecking,0 +setStrictErrorChecking,0 +getErrorChecking,0 +getStrictErrorChecking,0 +getInputEncoding,0 +setInputEncoding,0 +setXmlEncoding,0 +setEncoding,0 +getXmlEncoding,0 +getEncoding,0 +setXmlVersion,1 +setVersion,0 +getXmlVersion,0 +getVersion,0 +setXmlStandalone,0 +setStandalone,0 +getXmlStandalone,0 +getStandalone,0 +getDocumentURI,0 +canRenameElements,0 +renameNode,4 +replaceRenameElement,5 +normalizeDocument,0 +getDomConfig,0 +getBaseURI,0 +setDocumentURI,0 +getAsync,0 +setAsync,1 +abort,0 +load,0 +loadXML,0 +saveXML,1 +setMutationEvents,0 +getMutationEvents,0 +createDocumentType,0 +createEntity,1 +createNotation,1 +createElementDefinition,1 +getNodeNumber,0 +importNode,6 +adoptNode,4 +undeferChildren,0 +getElementById,0 +clearIdentifiers,0 +putIdentifier,0 +getIdentifier,0 +removeIdentifier,0 +getIdentifiers,0 +createElementNS,0 +createAttributeNS,0 +getElementsByTagNameNS,0 +clone,0 +isXMLName,0 +isValidQName,0 +isKidOK,3 +changed,0 +changes,0 +getNodeListCache,0 +freeNodeListCache,0 +setUserData,0 +getUserData,0 +getUserDataRecord,0 +removeUserDataTable,0 +setUserDataTable,0 +callUserDataHandlers,0 +checkNamespaceWF,1 +checkDOMNSErr,7 +checkQName,1 +isXML11Version,0 +isNormalizeDocRequired,0 +isXMLVersionChanged,0 +addEventListener,0 +removeEventListener,0 
+copyEventListeners,0 +dispatchEvent,0 +replacedText,0 +deletedText,0 +insertedText,0 +modifyingCharacterData,0 +modifiedCharacterData,0 +insertingNode,0 +insertedNode,0 +removingNode,0 +removedNode,0 +replacingNode,0 +replacedNode,0 +replacingData,0 +replacedCharacterData,0 +modifiedAttrValue,0 +setAttrNode,0 +removedAttrNode,0 +renamedAttrNode,0 +renamedElement,0 +readObject,0 +writeObject,0 diff --git a/clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv b/clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv new file mode 100644 index 0000000..6451504 --- /dev/null +++ b/clustering/org.apache.xerces.dom.CoreDocumentImpl_silhouette.csv @@ -0,0 +1,9 @@ +,k_means,hierarchical +2,0.880893568779594,0.880893568779594 +3,0.865058028908998,0.8832576727129332 +4,0.8716739969277025,0.9044225339758202 +5,0.8801678593155939,0.9033046629733426 +6,0.9027364726730837,0.9090951948593619 +7,0.9172474681026531,0.9392626024065982 +8,0.9426743010154992,0.9453062501090475 +9,0.9512213095625077,0.9512213095625078 diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_groundtruth.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_groundtruth.csv new file mode 100644 index 0000000..e470d78 --- /dev/null +++ b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_groundtruth.csv @@ -0,0 +1,92 @@ +,cluster +getGrammarDescription,0 +getElementDeclIsExternal,0 +getAttributeDeclIsExternal,0 +getAttributeDeclIndex,0 +startDTD,0 +startParameterEntity,11 +startExternalSubset,12 +endParameterEntity,11 +endExternalSubset,12 +elementDecl,0 +attributeDecl,0 +internalEntityDecl,0 +externalEntityDecl,0 +unparsedEntityDecl,0 +notationDecl,0 +endDTD,0 +setDTDSource,0 +getDTDSource,0 +textDecl,0 +comment,0 +processingInstruction,0 +startAttlist,0 +endAttlist,0 +startConditional,0 +ignoredCharacters,0 +endConditional,0 +setDTDContentModelSource,10 +getDTDContentModelSource,10 +startContentModel,10 +startGroup,0 +pcdata,0 +element,0 +separator,0 +occurrence,0 +endGroup,0 
+any,0 +empty,0 +endContentModel,10 +isNamespaceAware,0 +getSymbolTable,0 +getFirstElementDeclIndex,0 +getNextElementDeclIndex,0 +getElementDeclIndex,0 +getContentSpecType,10 +getElementDecl,0 +getElementDeclName,0 +getFirstAttributeDeclIndex,0 +getNextAttributeDeclIndex,0 +getAttributeDecl,0 +isCDATAAttribute,0 +getEntityDeclIndex,0 +getEntityDecl,0 +getNotationDeclIndex,0 +getNotationDecl,0 +getContentSpec,10 +getContentSpecIndex,10 +getContentSpecAsString,10 +printElements,0 +printAttributes,0 +addContentSpecToElement,10 +getElementContentModelValidator,10 +createElementDecl,1 +setElementDecl,0 +putElementNameMapping,0 +setFirstAttributeDeclIndex,0 +setContentSpecIndex,10 +createAttributeDecl,1 +setAttributeDecl,0 +createContentSpec,1 +setContentSpec,10 +createEntityDecl,1 +setEntityDecl,0 +createNotationDecl,1 +setNotationDecl,0 +addContentSpecNode,10 +addUniqueLeafNode,0 +initializeContentModelStack,10 +isImmutable,0 +appendContentSpec,10 +printAttribute,0 +createChildModel,1 +buildSyntaxTree,0 +contentSpecTree,10 +ensureElementDeclCapacity,0 +ensureAttributeDeclCapacity,0 +ensureEntityDeclCapacity,0 +ensureNotationDeclCapacity,0 +ensureContentSpecCapacity,10 +resize,0 +isEntityDeclared,0 +isEntityUnparsed,0 diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv new file mode 100644 index 0000000..6066f0d --- /dev/null +++ b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_hierarchical.csv @@ -0,0 +1,92 @@ +,cluster +getGrammarDescription,4 +getElementDeclIsExternal,4 +getAttributeDeclIsExternal,4 +getAttributeDeclIndex,0 +startDTD,4 +startParameterEntity,15 +startExternalSubset,4 +endParameterEntity,4 +endExternalSubset,4 +elementDecl,14 +attributeDecl,13 +internalEntityDecl,4 +externalEntityDecl,4 +unparsedEntityDecl,4 +notationDecl,4 +endDTD,4 +setDTDSource,4 +getDTDSource,4 +textDecl,4 +comment,4 +processingInstruction,4 +startAttlist,4 +endAttlist,4 
+startConditional,4 +ignoredCharacters,4 +endConditional,4 +setDTDContentModelSource,4 +getDTDContentModelSource,4 +startContentModel,4 +startGroup,4 +pcdata,4 +element,2 +separator,5 +occurrence,5 +endGroup,4 +any,4 +empty,4 +endContentModel,4 +isNamespaceAware,4 +getSymbolTable,4 +getFirstElementDeclIndex,4 +getNextElementDeclIndex,4 +getElementDeclIndex,4 +getContentSpecType,4 +getElementDecl,1 +getElementDeclName,4 +getFirstAttributeDeclIndex,4 +getNextAttributeDeclIndex,4 +getAttributeDecl,4 +isCDATAAttribute,0 +getEntityDeclIndex,4 +getEntityDecl,4 +getNotationDeclIndex,4 +getNotationDecl,4 +getContentSpec,4 +getContentSpecIndex,4 +getContentSpecAsString,2 +printElements,4 +printAttributes,4 +addContentSpecToElement,11 +getElementContentModelValidator,1 +createElementDecl,4 +setElementDecl,4 +putElementNameMapping,4 +setFirstAttributeDeclIndex,4 +setContentSpecIndex,4 +createAttributeDecl,8 +setAttributeDecl,4 +createContentSpec,4 +setContentSpec,4 +createEntityDecl,4 +setEntityDecl,4 +createNotationDecl,4 +setNotationDecl,4 +addContentSpecNode,4 +addUniqueLeafNode,2 +initializeContentModelStack,10 +isImmutable,4 +appendContentSpec,2 +printAttribute,4 +createChildModel,2 +buildSyntaxTree,2 +contentSpecTree,2 +ensureElementDeclCapacity,7 +ensureAttributeDeclCapacity,9 +ensureEntityDeclCapacity,12 +ensureNotationDeclCapacity,3 +ensureContentSpecCapacity,6 +resize,4 +isEntityDeclared,4 +isEntityUnparsed,4 diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv new file mode 100644 index 0000000..e7b5e04 --- /dev/null +++ b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_kmeans.csv @@ -0,0 +1,92 @@ +,cluster +getGrammarDescription,1 +getElementDeclIsExternal,1 +getAttributeDeclIsExternal,1 +getAttributeDeclIndex,15 +startDTD,1 +startParameterEntity,12 +startExternalSubset,1 +endParameterEntity,1 +endExternalSubset,1 +elementDecl,9 +attributeDecl,13 +internalEntityDecl,1 
+externalEntityDecl,1 +unparsedEntityDecl,1 +notationDecl,1 +endDTD,1 +setDTDSource,1 +getDTDSource,1 +textDecl,1 +comment,1 +processingInstruction,1 +startAttlist,1 +endAttlist,1 +startConditional,1 +ignoredCharacters,1 +endConditional,1 +setDTDContentModelSource,1 +getDTDContentModelSource,1 +startContentModel,1 +startGroup,1 +pcdata,1 +element,3 +separator,8 +occurrence,8 +endGroup,1 +any,1 +empty,1 +endContentModel,1 +isNamespaceAware,1 +getSymbolTable,1 +getFirstElementDeclIndex,1 +getNextElementDeclIndex,1 +getElementDeclIndex,1 +getContentSpecType,1 +getElementDecl,0 +getElementDeclName,1 +getFirstAttributeDeclIndex,1 +getNextAttributeDeclIndex,1 +getAttributeDecl,1 +isCDATAAttribute,7 +getEntityDeclIndex,1 +getEntityDecl,1 +getNotationDeclIndex,1 +getNotationDecl,1 +getContentSpec,1 +getContentSpecIndex,1 +getContentSpecAsString,3 +printElements,1 +printAttributes,1 +addContentSpecToElement,14 +getElementContentModelValidator,0 +createElementDecl,1 +setElementDecl,1 +putElementNameMapping,1 +setFirstAttributeDeclIndex,1 +setContentSpecIndex,1 +createAttributeDecl,7 +setAttributeDecl,1 +createContentSpec,1 +setContentSpec,1 +createEntityDecl,1 +setEntityDecl,1 +createNotationDecl,1 +setNotationDecl,1 +addContentSpecNode,1 +addUniqueLeafNode,3 +initializeContentModelStack,11 +isImmutable,1 +appendContentSpec,3 +printAttribute,1 +createChildModel,3 +buildSyntaxTree,3 +contentSpecTree,3 +ensureElementDeclCapacity,6 +ensureAttributeDeclCapacity,2 +ensureEntityDeclCapacity,4 +ensureNotationDeclCapacity,5 +ensureContentSpecCapacity,10 +resize,1 +isEntityDeclared,1 +isEntityUnparsed,1 diff --git a/clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv new file mode 100644 index 0000000..f05d176 --- /dev/null +++ b/clustering/org.apache.xerces.impl.dtd.DTDGrammar_silhouette.csv @@ -0,0 +1,16 @@ +,k_means,hierarchical +2,0.7973480585031026,0.5874087027846128 
+3,0.7929515822141272,0.6083207410570212 +4,0.7875327826881011,0.692671697230321 +5,0.7432807419504763,0.7141629304171452 +6,0.7211961130227403,0.7297457578156361 +7,0.7108188520737106,0.7538651674386223 +8,0.7521496486020739,0.7823024861034127 +9,0.7975294826166544,0.8082189521577593 +10,0.8045822208703368,0.8104671295723321 +11,0.8091345582763917,0.826856236491233 +12,0.8140357581974259,0.834060274351163 +13,0.8468647347587057,0.8414782883217177 +14,0.8416685816849977,0.8402352442946155 +15,0.8494389858738608,0.8512242552836264 +16,0.8571428571428571,0.8571428571428571 diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_groundtruth.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_groundtruth.csv new file mode 100644 index 0000000..cf530fd --- /dev/null +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_groundtruth.csv @@ -0,0 +1,107 @@ +,cluster +null2EmptyString,0 +emptyString2Null,0 +doc2SystemId,0 +parseSchema,0 +validateAnnotations,0 +createAnnotationValidator,1 +getGrammar,0 +findGrammar,0 +constructTrees,0 +isExistingGrammar,0 +updateImportListFor,0 +updateImportListWith,0 +buildGlobalNameRegistries,13 +traverseSchemas,0 +needReportTNSError,9 +addGlobalAttributeDecl,13 +addGlobalAttributeGroupDecl,13 +addGlobalElementDecl,13 +addGlobalGroupDecl,13 +addGlobalNotationDecl,13 +addGlobalTypeDecl,13 +addIDConstraintDecl,0 +getGlobalAttributeDecl,13 +getGlobalAttributeGroupDecl,13 +getGlobalElementDecl,13 +getGlobalGroupDecl,13 +getGlobalNotationDecl,13 +getGlobalTypeDecl,13 +getIDConstraintDecl,0 +getGlobalDecl,13 +getGlobalDeclFromGrammar,13 +traverseGlobalDecl,13 +schemaDocument2SystemId,0 +getGrpOrAttrGrpRedefinedByRestriction,0 +resolveKeyRefs,0 +getIDRegistry,0 +getIDRegistry_sub,0 +storeKeyRef,0 +resolveSchema,0 +resolveSchemaSource,0 +getSchemaDocument,0 +getSchemaDocument0,0 +getSchemaDocument1,0 +expandGrammars,0 +existingGrammars,0 +canAddComponents,14 +canAddComponent,14 +addGrammars,0 
+addGrammarComponents,14 +createGrammarFrom,1 +addNewGrammarLocations,0 +addNewImportedGrammars,0 +updateImportList,0 +addNewGrammarComponents,14 +addGlobalElementDecls,13 +addGlobalAttributeDecls,13 +addGlobalAttributeGroupDecls,13 +addGlobalNotationDecls,13 +addGlobalGroupDecls,13 +addGlobalTypeDecls,13 +expandComponents,14 +expandRelatedComponents,14 +expandRelatedAttributeComponents,14 +expandRelatedElementComponents,14 +expandRelatedTypeComponents,14 +expandRelatedModelGroupDefinitionComponents,14 +expandRelatedAttributeGroupComponents,14 +expandRelatedComplexTypeComponents,14 +expandRelatedSimpleTypeComponents,14 +expandRelatedAttributeUsesComponents,14 +expandRelatedAttributeUseComponents,14 +expandRelatedParticleComponents,14 +expandRelatedModelGroupComponents,14 +addRelatedType,0 +addRelatedElement,0 +addRelatedAttribute,0 +addGlobalComponents,13 +addGlobalComponent,13 +updateImportDependencies,0 +expandImportList,0 +addImportList,0 +containedImportedGrammar,0 +getSchemaGrammar,0 +findDependentNamespaces,0 +addNamespaceDependency,0 +reportSharingError,9 +createTraversers,1 +prepareForParse,0 +prepareForTraverse,0 +setDeclPool,0 +setDVFactory,0 +reset,0 +traverseLocalElements,0 +removeParticle,0 +fillInLocalElemInfo,0 +checkForDuplicateNames,0 +renameRedefiningComponents,14 +findQName,0 +changeRedefineGroup,0 +findXSDocumentForDecl,0 +nonAnnotationContent,10 +setSchemasVisible,0 +element2Locator,0 +reportSchemaError,9 +reportSchemaWarning,0 +setGenerateSyntheticAnnotations,0 diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv new file mode 100644 index 0000000..4a91877 --- /dev/null +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_hierarchical.csv @@ -0,0 +1,107 @@ +,cluster +null2EmptyString,6 +emptyString2Null,6 +doc2SystemId,3 +parseSchema,7 +validateAnnotations,3 +createAnnotationValidator,3 +getGrammar,3 +findGrammar,3 
+constructTrees,2 +isExistingGrammar,3 +updateImportListFor,3 +updateImportListWith,3 +buildGlobalNameRegistries,0 +traverseSchemas,0 +needReportTNSError,3 +addGlobalAttributeDecl,3 +addGlobalAttributeGroupDecl,3 +addGlobalElementDecl,3 +addGlobalGroupDecl,3 +addGlobalNotationDecl,3 +addGlobalTypeDecl,3 +addIDConstraintDecl,3 +getGlobalAttributeDecl,3 +getGlobalAttributeGroupDecl,3 +getGlobalElementDecl,3 +getGlobalGroupDecl,3 +getGlobalNotationDecl,3 +getGlobalTypeDecl,3 +getIDConstraintDecl,3 +getGlobalDecl,3 +getGlobalDeclFromGrammar,3 +traverseGlobalDecl,0 +schemaDocument2SystemId,3 +getGrpOrAttrGrpRedefinedByRestriction,6 +resolveKeyRefs,3 +getIDRegistry,3 +getIDRegistry_sub,3 +storeKeyRef,0 +resolveSchema,3 +resolveSchemaSource,3 +getSchemaDocument,12 +getSchemaDocument0,3 +getSchemaDocument1,3 +expandGrammars,3 +existingGrammars,3 +canAddComponents,3 +canAddComponent,5 +addGrammars,3 +addGrammarComponents,3 +createGrammarFrom,3 +addNewGrammarLocations,3 +addNewImportedGrammars,3 +updateImportList,3 +addNewGrammarComponents,3 +addGlobalElementDecls,5 +addGlobalAttributeDecls,5 +addGlobalAttributeGroupDecls,5 +addGlobalNotationDecls,5 +addGlobalGroupDecls,5 +addGlobalTypeDecls,5 +expandComponents,3 +expandRelatedComponents,5 +expandRelatedAttributeComponents,3 +expandRelatedElementComponents,3 +expandRelatedTypeComponents,3 +expandRelatedModelGroupDefinitionComponents,3 +expandRelatedAttributeGroupComponents,3 +expandRelatedComplexTypeComponents,3 +expandRelatedSimpleTypeComponents,3 +expandRelatedAttributeUsesComponents,3 +expandRelatedAttributeUseComponents,3 +expandRelatedParticleComponents,5 +expandRelatedModelGroupComponents,3 +addRelatedType,3 +addRelatedElement,5 +addRelatedAttribute,5 +addGlobalComponents,3 +addGlobalComponent,9 +updateImportDependencies,3 +expandImportList,3 +addImportList,3 +containedImportedGrammar,3 +getSchemaGrammar,3 +findDependentNamespaces,3 +addNamespaceDependency,3 +reportSharingError,3 +createTraversers,3 +prepareForParse,3 
+prepareForTraverse,3 +setDeclPool,3 +setDVFactory,3 +reset,3 +traverseLocalElements,10 +removeParticle,4 +fillInLocalElemInfo,8 +checkForDuplicateNames,3 +renameRedefiningComponents,2 +findQName,6 +changeRedefineGroup,11 +findXSDocumentForDecl,3 +nonAnnotationContent,3 +setSchemasVisible,3 +element2Locator,3 +reportSchemaError,1 +reportSchemaWarning,1 +setGenerateSyntheticAnnotations,3 diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv new file mode 100644 index 0000000..83617ea --- /dev/null +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_kmeans.csv @@ -0,0 +1,107 @@ +,cluster +null2EmptyString,8 +emptyString2Null,8 +doc2SystemId,0 +parseSchema,5 +validateAnnotations,0 +createAnnotationValidator,0 +getGrammar,0 +findGrammar,0 +constructTrees,3 +isExistingGrammar,0 +updateImportListFor,0 +updateImportListWith,0 +buildGlobalNameRegistries,2 +traverseSchemas,2 +needReportTNSError,0 +addGlobalAttributeDecl,0 +addGlobalAttributeGroupDecl,0 +addGlobalElementDecl,0 +addGlobalGroupDecl,0 +addGlobalNotationDecl,0 +addGlobalTypeDecl,0 +addIDConstraintDecl,0 +getGlobalAttributeDecl,0 +getGlobalAttributeGroupDecl,0 +getGlobalElementDecl,0 +getGlobalGroupDecl,0 +getGlobalNotationDecl,0 +getGlobalTypeDecl,0 +getIDConstraintDecl,0 +getGlobalDecl,0 +getGlobalDeclFromGrammar,0 +traverseGlobalDecl,2 +schemaDocument2SystemId,0 +getGrpOrAttrGrpRedefinedByRestriction,8 +resolveKeyRefs,0 +getIDRegistry,0 +getIDRegistry_sub,0 +storeKeyRef,9 +resolveSchema,0 +resolveSchemaSource,0 +getSchemaDocument,0 +getSchemaDocument0,0 +getSchemaDocument1,0 +expandGrammars,0 +existingGrammars,0 +canAddComponents,0 +canAddComponent,1 +addGrammars,0 +addGrammarComponents,0 +createGrammarFrom,0 +addNewGrammarLocations,0 +addNewImportedGrammars,0 +updateImportList,0 +addNewGrammarComponents,0 +addGlobalElementDecls,1 +addGlobalAttributeDecls,1 +addGlobalAttributeGroupDecls,1 
+addGlobalNotationDecls,1 +addGlobalGroupDecls,1 +addGlobalTypeDecls,1 +expandComponents,0 +expandRelatedComponents,1 +expandRelatedAttributeComponents,0 +expandRelatedElementComponents,0 +expandRelatedTypeComponents,0 +expandRelatedModelGroupDefinitionComponents,0 +expandRelatedAttributeGroupComponents,0 +expandRelatedComplexTypeComponents,0 +expandRelatedSimpleTypeComponents,0 +expandRelatedAttributeUsesComponents,0 +expandRelatedAttributeUseComponents,0 +expandRelatedParticleComponents,1 +expandRelatedModelGroupComponents,0 +addRelatedType,0 +addRelatedElement,1 +addRelatedAttribute,1 +addGlobalComponents,0 +addGlobalComponent,6 +updateImportDependencies,0 +expandImportList,0 +addImportList,0 +containedImportedGrammar,0 +getSchemaGrammar,0 +findDependentNamespaces,0 +addNamespaceDependency,0 +reportSharingError,0 +createTraversers,0 +prepareForParse,0 +prepareForTraverse,0 +setDeclPool,0 +setDVFactory,0 +reset,0 +traverseLocalElements,7 +removeParticle,10 +fillInLocalElemInfo,12 +checkForDuplicateNames,0 +renameRedefiningComponents,13 +findQName,8 +changeRedefineGroup,11 +findXSDocumentForDecl,0 +nonAnnotationContent,0 +setSchemasVisible,0 +element2Locator,0 +reportSchemaError,4 +reportSchemaWarning,4 +setGenerateSyntheticAnnotations,0 diff --git a/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv new file mode 100644 index 0000000..4abbf16 --- /dev/null +++ b/clustering/org.apache.xerces.impl.xs.traversers.XSDHandler_silhouette.csv @@ -0,0 +1,14 @@ +,k_means,hierarchical +2,0.653710343152024,0.6897290995261953 +3,0.7619679813132313,0.739093871180792 +4,0.7984312146893348,0.8064687623601431 +5,0.8435547562272465,0.8435547562272465 +6,0.8611535046317008,0.842649115950788 +7,0.8678669375642456,0.8504397238719222 +8,0.8739125874487823,0.8174948896615486 +9,0.8788481792716016,0.8747912741464551 +10,0.8903718217266576,0.8775840532668583 
+11,0.8875387867792598,0.8867064376197864 +12,0.8959671318660039,0.8952551515692575 +13,0.902637950556443,0.9017527022417632 +14,0.9084235209322024,0.8989895586680513 diff --git a/clustering/org.apache.xerces.xinclude.XIncludeHandler_groundtruth.csv b/clustering/org.apache.xerces.xinclude.XIncludeHandler_groundtruth.csv new file mode 100644 index 0000000..b0956f3 --- /dev/null +++ b/clustering/org.apache.xerces.xinclude.XIncludeHandler_groundtruth.csv @@ -0,0 +1,109 @@ +,cluster +reset,0 +getRecognizedFeatures,0 +setFeature,0 +getRecognizedProperties,0 +setProperty,0 +getFeatureDefault,0 +getPropertyDefault,0 +setDocumentHandler,0 +getDocumentHandler,0 +startDocument,0 +xmlDecl,0 +doctypeDecl,0 +comment,0 +processingInstruction,0 +startElement,0 +emptyElement,0 +endElement,0 +startGeneralEntity,0 +textDecl,0 +endGeneralEntity,0 +characters,0 +ignorableWhitespace,0 +startCDATA,0 +endCDATA,0 +endDocument,0 +setDocumentSource,0 +getDocumentSource,0 +attributeDecl,0 +elementDecl,0 +endAttlist,0 +endConditional,0 +endDTD,0 +endExternalSubset,12 +endParameterEntity,11 +externalEntityDecl,0 +getDTDSource,0 +ignoredCharacters,0 +internalEntityDecl,0 +notationDecl,0 +setDTDSource,0 +startAttlist,0 +startConditional,0 +startDTD,0 +startExternalSubset,12 +startParameterEntity,11 +unparsedEntityDecl,0 +getDTDHandler,0 +setDTDHandler,0 +setErrorReporter,9 +handleFallbackElement,0 +handleIncludeElement,0 +hasXIncludeNamespace,0 +isIncludeElement,0 +isFallbackElement,0 +sameBaseURIAsIncludeParent,4 +sameLanguageAsIncludeParent,0 +setupCurrentBaseURI,4 +searchForRecursiveIncludes,0 +isTopLevelIncludedItem,0 +isTopLevelIncludedItemViaInclude,0 +isTopLevelIncludedItemViaFallback,0 +processAttributes,0 +getRelativeBaseURI,4 +getIncludeParentBaseURI,4 +getIncludeParentLanguage,0 +getIncludeParentDepth,0 +getResultDepth,0 +modifyAugmentations,0 +getState,0 +setState,0 +setSawFallback,0 +getSawFallback,0 +setSawInclude,0 +getSawInclude,0 +reportResourceError,9 +reportFatalError,9 
+reportError,9 +setParent,0 +setHref,0 +setXIncludeLocator,0 +isRootDocument,0 +addUnparsedEntity,0 +addNotation,0 +checkUnparsedEntity,0 +checkNotation,0 +checkAndSendUnparsedEntity,0 +checkAndSendNotation,0 +checkWhitespace,0 +checkMultipleRootElements,0 +setRootElementProcessed,0 +getRootElementProcessed,0 +copyFeatures,0 +copyFeatures1,0 +saveBaseURI,4 +restoreBaseURI,4 +saveLanguage,0 +restoreLanguage,0 +getBaseURI,4 +getLanguage,0 +getRelativeURI,4 +scopeOfBaseURI,4 +scopeOfLanguage,0 +processXMLBaseAttributes,0 +processXMLLangAttributes,0 +isValidInHTTPHeader,0 +createInputSource,1 +isEqual,0 +escapeHref,0 diff --git a/clustering/org.apache.xerces.xinclude.XIncludeHandler_hierarchical.csv b/clustering/org.apache.xerces.xinclude.XIncludeHandler_hierarchical.csv new file mode 100644 index 0000000..0dfd896 --- /dev/null +++ b/clustering/org.apache.xerces.xinclude.XIncludeHandler_hierarchical.csv @@ -0,0 +1,109 @@ +,cluster +reset,13 +getRecognizedFeatures,0 +setFeature,0 +getRecognizedProperties,0 +setProperty,0 +getFeatureDefault,12 +getPropertyDefault,8 +setDocumentHandler,0 +getDocumentHandler,0 +startDocument,9 +xmlDecl,0 +doctypeDecl,0 +comment,0 +processingInstruction,0 +startElement,0 +emptyElement,0 +endElement,0 +startGeneralEntity,7 +textDecl,0 +endGeneralEntity,0 +characters,0 +ignorableWhitespace,0 +startCDATA,0 +endCDATA,0 +endDocument,0 +setDocumentSource,0 +getDocumentSource,0 +attributeDecl,0 +elementDecl,0 +endAttlist,0 +endConditional,0 +endDTD,0 +endExternalSubset,0 +endParameterEntity,0 +externalEntityDecl,0 +getDTDSource,0 +ignoredCharacters,0 +internalEntityDecl,0 +notationDecl,0 +setDTDSource,0 +startAttlist,0 +startConditional,0 +startDTD,0 +startExternalSubset,0 +startParameterEntity,0 +unparsedEntityDecl,0 +getDTDHandler,0 +setDTDHandler,0 +setErrorReporter,3 +handleFallbackElement,0 +handleIncludeElement,11 +hasXIncludeNamespace,0 +isIncludeElement,0 +isFallbackElement,0 +sameBaseURIAsIncludeParent,0 +sameLanguageAsIncludeParent,0 
+setupCurrentBaseURI,0 +searchForRecursiveIncludes,0 +isTopLevelIncludedItem,0 +isTopLevelIncludedItemViaInclude,0 +isTopLevelIncludedItemViaFallback,0 +processAttributes,14 +getRelativeBaseURI,0 +getIncludeParentBaseURI,0 +getIncludeParentLanguage,0 +getIncludeParentDepth,0 +getResultDepth,0 +modifyAugmentations,5 +getState,0 +setState,0 +setSawFallback,10 +getSawFallback,10 +setSawInclude,4 +getSawInclude,4 +reportResourceError,1 +reportFatalError,1 +reportError,3 +setParent,0 +setHref,0 +setXIncludeLocator,0 +isRootDocument,0 +addUnparsedEntity,0 +addNotation,0 +checkUnparsedEntity,0 +checkNotation,0 +checkAndSendUnparsedEntity,0 +checkAndSendNotation,0 +checkWhitespace,0 +checkMultipleRootElements,0 +setRootElementProcessed,0 +getRootElementProcessed,0 +copyFeatures,7 +copyFeatures1,0 +saveBaseURI,0 +restoreBaseURI,0 +saveLanguage,0 +restoreLanguage,0 +getBaseURI,0 +getLanguage,0 +getRelativeURI,0 +scopeOfBaseURI,0 +scopeOfLanguage,0 +processXMLBaseAttributes,2 +processXMLLangAttributes,2 +isValidInHTTPHeader,0 +createInputSource,6 +isEqual,0 +escapeHref,0 diff --git a/clustering/org.apache.xerces.xinclude.XIncludeHandler_kmeans.csv b/clustering/org.apache.xerces.xinclude.XIncludeHandler_kmeans.csv new file mode 100644 index 0000000..5e3322f --- /dev/null +++ b/clustering/org.apache.xerces.xinclude.XIncludeHandler_kmeans.csv @@ -0,0 +1,109 @@ +,cluster +reset,2 +getRecognizedFeatures,0 +setFeature,0 +getRecognizedProperties,0 +setProperty,0 +getFeatureDefault,0 +getPropertyDefault,13 +setDocumentHandler,0 +getDocumentHandler,0 +startDocument,9 +xmlDecl,0 +doctypeDecl,0 +comment,0 +processingInstruction,0 +startElement,0 +emptyElement,0 +endElement,0 +startGeneralEntity,1 +textDecl,0 +endGeneralEntity,0 +characters,0 +ignorableWhitespace,0 +startCDATA,0 +endCDATA,0 +endDocument,0 +setDocumentSource,0 +getDocumentSource,0 +attributeDecl,0 +elementDecl,0 +endAttlist,0 +endConditional,0 +endDTD,0 +endExternalSubset,0 +endParameterEntity,0 +externalEntityDecl,0 
+getDTDSource,0 +ignoredCharacters,0 +internalEntityDecl,0 +notationDecl,0 +setDTDSource,0 +startAttlist,0 +startConditional,0 +startDTD,0 +startExternalSubset,0 +startParameterEntity,0 +unparsedEntityDecl,0 +getDTDHandler,0 +setDTDHandler,0 +setErrorReporter,4 +handleFallbackElement,0 +handleIncludeElement,3 +hasXIncludeNamespace,0 +isIncludeElement,0 +isFallbackElement,0 +sameBaseURIAsIncludeParent,0 +sameLanguageAsIncludeParent,0 +setupCurrentBaseURI,0 +searchForRecursiveIncludes,0 +isTopLevelIncludedItem,0 +isTopLevelIncludedItemViaInclude,0 +isTopLevelIncludedItemViaFallback,0 +processAttributes,12 +getRelativeBaseURI,0 +getIncludeParentBaseURI,0 +getIncludeParentLanguage,0 +getIncludeParentDepth,0 +getResultDepth,0 +modifyAugmentations,11 +getState,0 +setState,14 +setSawFallback,8 +getSawFallback,8 +setSawInclude,7 +getSawInclude,7 +reportResourceError,6 +reportFatalError,6 +reportError,4 +setParent,0 +setHref,0 +setXIncludeLocator,0 +isRootDocument,0 +addUnparsedEntity,0 +addNotation,0 +checkUnparsedEntity,0 +checkNotation,0 +checkAndSendUnparsedEntity,0 +checkAndSendNotation,0 +checkWhitespace,0 +checkMultipleRootElements,0 +setRootElementProcessed,0 +getRootElementProcessed,0 +copyFeatures,1 +copyFeatures1,0 +saveBaseURI,0 +restoreBaseURI,0 +saveLanguage,0 +restoreLanguage,0 +getBaseURI,0 +getLanguage,0 +getRelativeURI,0 +scopeOfBaseURI,0 +scopeOfLanguage,0 +processXMLBaseAttributes,5 +processXMLLangAttributes,5 +isValidInHTTPHeader,0 +createInputSource,10 +isEqual,0 +escapeHref,0 diff --git a/clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv b/clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv new file mode 100644 index 0000000..eb9a459 --- /dev/null +++ b/clustering/org.apache.xerces.xinclude.XIncludeHandler_silhouette.csv @@ -0,0 +1,15 @@ +,k_means,hierarchical +2,0.7400737900068505,0.7450255723178434 +3,0.750834341421787,0.7590675173051686 +4,0.7679751766235376,0.7537209967731088 
#!/usr/bin/env python3
"""Build keyword-based ground-truth clusterings.

For every ``feature_vectors/<class>.csv`` the row labels (first CSV column)
are matched against the keywords listed in ``keyword_list.txt`` and the
resulting assignment is written to ``clustering/<class>_groundtruth.csv``.
"""
import os
import pandas as pd
import glob


DIR: str = os.path.dirname(os.path.realpath(__file__))
IN_DIR: str = DIR + '/feature_vectors'
OUT_DIR: str = DIR + '/clustering'


def clean_output():
    """Delete every ``*_groundtruth.csv`` file currently present in OUT_DIR."""
    for f in glob.glob(OUT_DIR + '/*_groundtruth.csv'):
        os.remove(f)


def ground_truth(method_name: str, keywords: list[str]) -> int:
    """Return the cluster id of *method_name*.

    The id is the 1-based position of the first keyword that occurs as a
    substring of *method_name*; 0 means that no keyword matched. Matching is
    case-sensitive, so callers are expected to pass both sides lower-cased.
    """
    for i, key in enumerate(keywords, start=1):
        # An empty string is a substring of every name: skip blank entries so
        # that a stray empty line in the keyword file cannot capture every
        # method. Positions of the other keywords are left untouched.
        if key and key in method_name:
            return i

    return 0


def create_ground_truth(path: str, keywords: list[str]):
    """Write the ground-truth clustering for the feature-vector CSV at *path*.

    The output keeps only the row labels of the input plus a ``cluster``
    column and is saved as ``OUT_DIR/<input name without extension>_groundtruth.csv``.
    """
    clazz_name = os.path.splitext(os.path.basename(path))[0]

    # filter([]) drops every feature column and keeps only the row index.
    df = pd.read_csv(path, index_col=0).filter([])
    df['cluster'] = df.index.map(lambda m: ground_truth(m.lower(), keywords))

    os.makedirs(OUT_DIR, exist_ok=True)
    df.to_csv(OUT_DIR + '/' + clazz_name + '_groundtruth.csv')


def main():
    """Regenerate the ground-truth CSV of every class found in IN_DIR."""
    with open(DIR + '/keyword_list.txt', 'r', encoding='utf-8') as f:
        keywords: list[str] = [line.strip().lower() for line in f]

    clean_output()

    for f in glob.glob(IN_DIR + '/*.csv'):
        create_ground_truth(f, keywords)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""Complete-linkage agglomerative clustering of one class' feature vectors.

Reads ``feature_vectors/<class>.csv`` and writes the cluster assigned to each
row to ``clustering/<class>_hierarchical.csv``.
"""
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import glob
import os
import pandas as pd
import argparse

DIR: str = os.path.dirname(os.path.realpath(__file__))
OUT_DIR: str = DIR + '/clustering'
IN_DIR: str = DIR + '/feature_vectors'


def cluster_hierarchical(path: str, n_clusters: int, save_to_disk: bool = True) -> tuple[np.ndarray, np.ndarray]:
    """Cluster the rows of the CSV at *path* into *n_clusters* groups.

    The first CSV column is taken as the row label; all remaining columns
    form the feature matrix.

    Returns:
        ``(X, Y)`` where ``X`` is the feature matrix and ``Y`` the array of
        cluster labels, one per row of ``X``.

    When *save_to_disk* is true the labelled assignment is also written to
    ``OUT_DIR/<input name without extension>_hierarchical.csv``.
    """
    clazz_name = os.path.splitext(os.path.basename(path))[0]

    df = pd.read_csv(path, index_col=0)
    X = df.to_numpy()

    model = AgglomerativeClustering(
        n_clusters=n_clusters, linkage='complete').fit(X)
    Y = model.labels_  # cluster number assigned to each row

    # combine cluster labels with the row labels (method names)
    assigned = pd.DataFrame(Y, columns=['cluster']).set_axis(df.index.values)

    if save_to_disk:
        os.makedirs(OUT_DIR, exist_ok=True)
        assigned.to_csv(OUT_DIR + '/' + clazz_name + '_hierarchical.csv')

    return (X, Y,)


def main():
    """Command-line entry point: ``hierarchical.py CLASS_NAME N_CLUSTERS``."""
    parser = argparse.ArgumentParser(
        description='Compute agglomerative clustering')
    parser.add_argument('class_name', type=str, help='name of the god class')
    parser.add_argument('n_clusters', type=int, help='number of clusters')

    args = parser.parse_args()
    path = IN_DIR + '/' + args.class_name + '.csv'

    # Drop a stale result first so a failed run cannot leave old data behind;
    # on the very first run there is nothing to delete, which is fine.
    try:
        os.remove(OUT_DIR + '/' + args.class_name + '_hierarchical.csv')
    except FileNotFoundError:
        pass

    cluster_hierarchical(path, args.n_clusters)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""K-means clustering of one class' feature vectors.

Reads ``feature_vectors/<class>.csv`` and writes the cluster assigned to each
row to ``clustering/<class>_kmeans.csv``.
"""
from sklearn.cluster import KMeans
import numpy as np
import glob
import os
import pandas as pd
import argparse

DIR: str = os.path.dirname(os.path.realpath(__file__))
OUT_DIR: str = DIR + '/clustering'
IN_DIR: str = DIR + '/feature_vectors'

# Fixed seed passed to KMeans as random_state so runs are repeatable.
RAND_SEED: int = 0


def cluster_kmeans(path: str, n_clusters: int, save_to_disk: bool = True) -> tuple[np.ndarray, np.ndarray]:
    """Cluster the rows of the CSV at *path* into *n_clusters* groups.

    The first CSV column is taken as the row label; all remaining columns
    form the feature matrix.

    Returns:
        ``(X, Y)`` where ``X`` is the feature matrix and ``Y`` the array of
        cluster labels, one per row of ``X``.

    When *save_to_disk* is true the labelled assignment is also written to
    ``OUT_DIR/<input name without extension>_kmeans.csv``.
    """
    clazz_name = os.path.splitext(os.path.basename(path))[0]

    df = pd.read_csv(path, index_col=0)
    X = df.to_numpy()

    model = KMeans(n_clusters=n_clusters,
                   random_state=RAND_SEED, n_init='auto').fit(X)
    Y = model.labels_  # cluster number assigned to each row

    # combine cluster labels with the row labels (method names)
    assigned = pd.DataFrame(Y, columns=['cluster']).set_axis(df.index.values)

    if save_to_disk:
        os.makedirs(OUT_DIR, exist_ok=True)
        assigned.to_csv(OUT_DIR + '/' + clazz_name + '_kmeans.csv')

    return (X, Y,)


def main():
    """Command-line entry point: ``k_means.py CLASS_NAME N_CLUSTERS``."""
    parser = argparse.ArgumentParser(
        description='Compute k-means clustering')
    parser.add_argument('class_name', type=str, help='name of the god class')
    parser.add_argument('n_clusters', type=int, help='number of clusters')

    args = parser.parse_args()
    path = IN_DIR + '/' + args.class_name + '.csv'

    # Drop a stale result first so a failed run cannot leave old data behind;
    # on the very first run there is nothing to delete, which is fine.
    try:
        os.remove(OUT_DIR + '/' + args.class_name + '_kmeans.csv')
    except FileNotFoundError:
        pass

    cluster_kmeans(path, args.n_clusters)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""Pairwise precision and recall of the computed clusterings.

Each ``clustering/<class>_kmeans.csv`` and ``clustering/<class>_hierarchical.csv``
is compared with ``clustering/<class>_groundtruth.csv`` on the set of
"intra-pairs", i.e. unordered pairs of rows placed in the same cluster.
"""
import numpy as np
import glob
import os
import pandas as pd
from itertools import combinations

DIR: str = os.path.dirname(os.path.realpath(__file__))
IN_DIR: str = DIR + '/clustering'
OUT_DIR: str = DIR + ''


def intrapairs(path: str) -> set[frozenset[str]]:
    """Return all unordered pairs of row labels sharing a cluster.

    *path* is a CSV whose first column holds the row labels and which has a
    ``cluster`` column. Every pair is a frozenset holding the two labels.
    """
    df = pd.read_csv(path, index_col=0)

    pairs: set[frozenset[str]] = set()
    for _, members in df.groupby('cluster'):
        pairs.update(frozenset(pair)
                     for pair in combinations(members.index, 2))
    return pairs


def precision_recall(ground_pairs: set, cluster_pairs: set) -> tuple[float, float]:
    """Return ``(precision, recall)`` of *cluster_pairs* w.r.t. *ground_pairs*.

    A ratio whose denominator is empty (no pair at all on that side) is
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    n_common = len(ground_pairs & cluster_pairs)
    precision = n_common / len(cluster_pairs) if cluster_pairs else 0.0
    recall = n_common / len(ground_pairs) if ground_pairs else 0.0
    return precision, recall


def main():
    """Print precision and recall of both algorithms for every class."""
    # sorted() keeps the report order stable across runs and platforms
    for f in sorted(glob.glob(IN_DIR + '/*_groundtruth.csv')):
        clazz_name = os.path.basename(f)
        clazz_name = clazz_name[:clazz_name.rfind('_groundtruth.csv')]
        print(clazz_name)

        ground_pairs = intrapairs(f)
        for method in ['kmeans', 'hierarchical']:
            cluster_pairs = intrapairs(
                IN_DIR + '/' + clazz_name + '_' + method + '.csv')

            precision, recall = precision_recall(ground_pairs, cluster_pairs)

            print(f"{method} precision: {precision}")
            print(f"{method} recall: {recall}")

        print()


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""Silhouette-based evaluation of the k-means and agglomerative clusterings.

Without options the silhouette of the clusterings already stored in
``clustering/`` is printed for every class. With ``--validate`` a range of
cluster counts is scored per class and saved to
``clustering/<class>_silhouette.csv``; ``--autorun`` additionally stores the
clustering obtained with the best cluster count of each algorithm.
"""
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
import glob
import os
import pandas as pd
import argparse
from k_means import cluster_kmeans
from hierarchical import cluster_hierarchical

DIR: str = os.path.dirname(os.path.realpath(__file__))
OUT_DIR: str = DIR + '/clustering'
IN_DIR: str = DIR + '/feature_vectors'

# Exclusive upper bound on the number of clusters tried by validate().
K_MAX: int = 65


def clean_output():
    """Delete every ``*_silhouette.csv`` file currently present in OUT_DIR."""
    for f in glob.glob(OUT_DIR + '/*_silhouette.csv'):
        os.remove(f)


def validate(path: str, clazz_name: str, autorun: bool):
    """Score both algorithms for a range of cluster counts and report the best.

    Args:
        path: feature-vector CSV of the class.
        clazz_name: name used for the ``<clazz_name>_silhouette.csv`` output.
        autorun: when true, re-run both algorithms with their best cluster
            count and let them store the resulting clustering.
    """
    df = pd.DataFrame(columns=['k_means', 'hierarchical'], dtype=float)

    # We bound the number of clusters by the number of distinct points in our
    # dataset: count the distinct feature vectors and stop before the minimum
    # of K_MAX and this number.
    nodup = pd.read_csv(path, index_col=0).drop_duplicates()
    max_distinct = len(nodup)

    for n in range(2, min(K_MAX, max_distinct)):
        # Each score goes into the column named after the algorithm that
        # produced it (the two columns used to be swapped).
        X_k, Y_k = cluster_kmeans(path, n, save_to_disk=False)
        df.loc[n, 'k_means'] = silhouette_score(X_k, Y_k)

        X_h, Y_h = cluster_hierarchical(path, n, save_to_disk=False)
        df.loc[n, 'hierarchical'] = silhouette_score(X_h, Y_h)

    os.makedirs(OUT_DIR, exist_ok=True)
    df.to_csv(OUT_DIR + '/' + clazz_name + '_silhouette.csv')

    if df.empty:
        # Fewer than three distinct feature vectors: no cluster count to try.
        print("Not enough distinct feature vectors to validate")
        return

    k_kmeans = int(df['k_means'].idxmax())
    k_hierarchical = int(df['hierarchical'].idxmax())

    print("K_means optimal value: " + str(k_kmeans))
    print("Hierarchical optimal value: " + str(k_hierarchical))

    if autorun:
        cluster_hierarchical(path, k_hierarchical)
        cluster_kmeans(path, k_kmeans)


def compute_silhouette(path: str, clazz_name: str, suffix: str):
    """Print the silhouette of the stored ``<clazz_name>_<suffix>.csv`` clustering.

    The stored labels are paired with the feature vectors of *path* by row
    position, so both files must list the rows in the same order.
    """
    df_y = pd.read_csv(OUT_DIR + '/' + clazz_name + '_' + suffix + '.csv',
                       index_col=0)
    Y = df_y['cluster'].values

    X = pd.read_csv(path, index_col=0).to_numpy()

    print("Silhouette for " + suffix + ": " + str(silhouette_score(X, Y)))


def main():
    """Command-line entry point, see the module docstring for the options."""
    parser = argparse.ArgumentParser(description='Compute silhouette metric.')
    parser.add_argument('--validate', action='store_true',
                        help='compute optimal k for each algorithm')
    parser.add_argument('--autorun', action='store_true',
                        help='if validating, computes CSV for optimal clustering automatically')

    args = parser.parse_args()

    if args.validate:
        clean_output()

    for f in glob.glob(IN_DIR + '/*.csv'):
        clazz_name = os.path.splitext(os.path.basename(f))[0]

        print(clazz_name)

        if args.validate:
            validate(f, clazz_name, args.autorun)

        compute_silhouette(f, clazz_name, 'kmeans')
        compute_silhouette(f, clazz_name, 'hierarchical')

        print()


if __name__ == '__main__':
    main()