Part 2 done, but with Doc2Vec
This commit is contained in:
parent
ad1d07623c
commit
3196c1c199
2 changed files with 39 additions and 6 deletions
|
@ -1,2 +1,3 @@
|
||||||
nltk==3.8.1
|
nltk==3.8.1
|
||||||
pandas==2.1.1
|
pandas==2.1.1
|
||||||
|
gensim==4.3.2
|
||||||
|
|
|
@ -5,6 +5,10 @@ import pandas as pd
|
||||||
import nltk
|
import nltk
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
|
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
|
||||||
|
from gensim.models import TfidfModel, LsiModel, LdaModel
|
||||||
|
from gensim.corpora import Dictionary
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
nltk.download('stopwords')
|
nltk.download('stopwords')
|
||||||
|
|
||||||
|
@ -44,20 +48,48 @@ def get_bow(data, split_f):
|
||||||
return remove_stopwords(split_f(data))
|
return remove_stopwords(split_f(data))
|
||||||
|
|
||||||
|
|
||||||
def print_sims(corpus, query, df, dictionary, pick_top=5):
    """Print the best matches of *query* against *corpus*, most similar first.

    Args:
        corpus: gensim corpus (iterable of BoW vectors, possibly already
            transformed by a model such as TF-IDF or LSI).
        query: query vector in the same vector space as *corpus*.
        df: pandas DataFrame whose positional index lines up with *corpus*;
            rows must provide "type", "name", "file" and "line" columns.
        dictionary: gensim Dictionary used to build *corpus*; its size sets
            the feature count of the similarity index.
        pick_top: how many top matches to print (default 5, the value that
            was previously hard-coded).
    """
    index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
    sims = index[query]

    # Rank every document by similarity score, highest first, keep the best few.
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
    for idx, score in ranked[:pick_top]:
        row = df.loc[idx]
        print("Similarity: {s:2.02f}%".format(s=score * 100))
        print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
            .format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"]))
|
def search(query, method):
    """Search the indexed code corpus for *query* with the chosen method.

    Builds a bag-of-words corpus from the dataset's identifier names and
    comments, then prints the most similar entries via print_sims.

    Args:
        query: free-text query string; tokenized with comment_split.
        method: one of "tfidf", "freq" or "lsi".

    Raises:
        ValueError: if *method* is not a supported choice (previously an
            unknown method silently printed nothing).
    """
    df = pd.read_csv(IN_DATASET)
    df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
    df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))

    # One document per row: identifier tokens followed by comment tokens.
    corpus_list = [row["name_bow"] + row["comment_bow"] for _, row in df.iterrows()]

    dictionary = Dictionary(corpus_list)
    corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
    query_bow = dictionary.doc2bow(get_bow(query, comment_split))

    if method == "tfidf":
        tfidf = TfidfModel(corpus_bow)
        print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
    elif method == "freq":
        # Raw term frequencies: compare the untransformed BoW vectors.
        print_sims(corpus_bow, query_bow, df, dictionary)
    elif method == "lsi":
        lsi = LsiModel(corpus_bow)
        print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
    else:
        raise ValueError("unknown method: {!r} (expected 'tfidf', 'freq' or 'lsi')".format(method))
||||||
def main():
    """Parse the command line (method, query) and run the search."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("method", help="the method to compare similarities with", type=str)
    arg_parser.add_argument("query", help="the query to search the corpus with", type=str)
    parsed = arg_parser.parse_args()
    search(parsed.query, parsed.method)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in a new issue