part 2 done but Doc2Vec
parent ad1d07623c
commit 3196c1c199
2 changed files with 39 additions and 6 deletions
@@ -1,2 +1,3 @@
 nltk==3.8.1
 pandas==2.1.1
+gensim==4.3.2
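Only the gensim pin is new here; nltk and pandas carry over from part 1. A minimal sanity check, assuming the pinned dependencies are installed:

    # Verify the installed gensim matches the version pinned above.
    import gensim
    assert gensim.__version__ == "4.3.2", gensim.__version__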
@@ -5,6 +5,10 @@ import pandas as pd
 import nltk
+import numpy as np
 from nltk.corpus import stopwords
+from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
+from gensim.models import TfidfModel, LsiModel, LdaModel
+from gensim.corpora import Dictionary
 from collections import defaultdict
 
 nltk.download('stopwords')
 
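The added imports are the gensim pieces used further down: Dictionary for the token-to-id mapping, TfidfModel and LsiModel for corpus transformations, and SparseMatrixSimilarity for the similarity index (MatrixSimilarity and LdaModel are imported but not used in this commit; MatrixSimilarity is the usual index for dense corpora such as LSI output). A minimal sketch of the query pattern these imports enable, using made-up toy documents:

    from gensim.corpora import Dictionary
    from gensim.similarities import SparseMatrixSimilarity

    # Toy corpus: each document is already tokenized into a bag of words.
    docs = [["open", "file", "read"], ["write", "file", "close"]]
    dictionary = Dictionary(docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Build the index once, then query it with any BoW vector from the same dictionary.
    index = SparseMatrixSimilarity(bow_corpus, num_features=len(dictionary))
    query = dictionary.doc2bow(["read", "the", "file"])
    print(list(index[query]))  # cosine similarity of the query against each document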
@@ -44,20 +48,48 @@ def get_bow(data, split_f):
     return remove_stopwords(split_f(data))
 
 
-def search(query):
-    df = pd.read_csv(IN_DATASET)
+def print_sims(corpus, query, df, dictionary):
+    index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
+    sims = index[query]
+    pick_top = 5
+
-    for i, row in df.iterrows():
-        name_bow = get_bow(row["name"], identifier_split)
-        comment_bow = get_bow(row["comment"], comment_split)
+    for idx, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]:
+        row = df.loc[idx]
+        print("Similarity: {s:2.02f}%".format(s=score*100))
+        print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
+            .format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"]))
 
 
+def search(query, method):
+    df = pd.read_csv(IN_DATASET)
+    df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
+    df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))
+
+    corpus_list = []
+    for idx, row in df.iterrows():
+        document_words = row["name_bow"] + row["comment_bow"]
+        corpus_list.append(document_words)
+
+    dictionary = Dictionary(corpus_list)
+    corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
+    query_bow = dictionary.doc2bow(get_bow(query, comment_split))
+
+    if method == "tfidf":
+        tfidf = TfidfModel(corpus_bow)
+        print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
+    elif method == "freq":
+        print_sims(corpus_bow, query_bow, df, dictionary)
+    elif method == "lsi":
+        lsi = LsiModel(corpus_bow)
+        print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
 
 
 def main():
     parser = argparse.ArgumentParser()
+    parser.add_argument("method", help="the method to compare similarities with", type=str)
     parser.add_argument("query", help="the query to search the corpus with", type=str)
     args = parser.parse_args()
-    search(args.query)
+    search(args.query, args.method)
 
 
 if __name__ == "__main__":
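With the reworked entry point, the method is the first positional argument and the query the second. A hypothetical direct call (the script's file name is not shown in this view), assuming IN_DATASET points at the CSV built in part 1 with the columns used above (name, comment, type, file, line):

    # Prints the top-5 most similar Python entities for the query.
    search("read a csv file into a dataframe", "tfidf")  # method may also be "freq" or "lsi"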
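Per the commit message, the Doc2Vec method is still outstanding. A rough sketch of what such a branch of search() might look like with gensim's Doc2Vec, reusing the corpus_list of tokenized documents built above; this is not part of this commit, and the function name and hyperparameters are illustrative only:

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    def doc2vec_sims(corpus_list, query_words, pick_top=5):
        # Tag each document with its row index so results map back to the dataframe.
        tagged = [TaggedDocument(words, [i]) for i, words in enumerate(corpus_list)]
        model = Doc2Vec(tagged, vector_size=100, min_count=1, epochs=40)
        query_vec = model.infer_vector(query_words)
        # Returns [(doc_index, cosine_similarity), ...], most similar first.
        return model.dv.most_similar([query_vec], topn=pick_top)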