This is a Python implementation of cosine similarity between two text documents, using Term Frequency–Inverse Document Frequency (TF-IDF) vectors. TF-IDF is a standard technique for measuring text similarity and is widely used in natural language processing.
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class TextSimilarityTFIDF:
    """Compute the cosine similarity between two documents using TF-IDF vectors."""

    def __init__(self):
        # The vectorizer is re-fitted on each document pair inside
        # calculate_cosine_similarity, so no vocabulary state leaks
        # between calls.
        self.tfidf_vectorizer = TfidfVectorizer()
        # NOTE(review): requires the NLTK "stopwords" corpus to be available
        # (nltk.download("stopwords")) — confirm the environment provides it.
        self.stop_words = set(stopwords.words("english"))

    def preprocess(self, document):
        """Lowercase, tokenize, and filter *document*.

        Keeps only alphanumeric tokens that are not English stopwords, and
        returns them re-joined into a single space-separated string, ready
        for TfidfVectorizer.
        """
        tokens = nltk.word_tokenize(document.lower())
        kept = [tok for tok in tokens
                if tok.isalnum() and tok not in self.stop_words]
        return " ".join(kept)

    def calculate_cosine_similarity(self, document1, document2):
        """Return the TF-IDF cosine similarity of two documents, in [0, 1].

        Returns 0.0 when preprocessing leaves no usable vocabulary (e.g.
        both documents contain only stopwords/punctuation), instead of
        letting TfidfVectorizer raise ValueError("empty vocabulary ...").
        """
        corpus = [self.preprocess(document1), self.preprocess(document2)]
        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform(corpus)
        except ValueError:
            # Empty vocabulary after preprocessing — no basis for similarity.
            return 0.0
        # Use the explicit cosine_similarity helper rather than a raw sparse
        # dot product: the dot product equals cosine similarity only because
        # TfidfVectorizer defaults to norm="l2". This form stays correct even
        # if the vectorizer configuration changes.
        return float(cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0])
# Example usage: compare two short documents about NLP.
if __name__ == "__main__":
    similarity_engine = TextSimilarityTFIDF()
    document1 = "This is a sample document about natural language processing."
    document2 = "Natural language processing is a subfield of artificial intelligence."
    score = similarity_engine.calculate_cosine_similarity(document1, document2)
    print(f"Cosine Similarity: {score}")