package tsp.application; import java.util.*; public class SearchEngine { HashMap> inverseIndex; double nDocuments; public SearchEngine(){ this.inverseIndex = new HashMap<>(); this.nDocuments = 0; } public void add(Document document){ addInverseIndex(document); nDocuments += 1; } private void addInverseIndex(Document document){ String[] words = document.preprocess(); HashSet processed = new HashSet<>(); for (String word: words){ if (processed.contains(word)) continue; if (!inverseIndex.containsKey(word)){ ArrayList newAL = new ArrayList<>(); inverseIndex.put(word, newAL); } inverseIndex.get(word).add(document); processed.add(word); } } public double getFrequency(String word, Document document){ return document.getFrequency(word); } public double getIDF(String word){ if (!this.inverseIndex.containsKey(word)) return 0; return Math.log(this.nDocuments / this.inverseIndex.get(word).size()); } public double getTFIDF(String word, Document document){ return this.getFrequency(word, document) * this.getIDF(word); } private double[] getTFIDFVector(Document document, String[] words){ double[] res = new double[words.length]; for (int i = 0; i < words.length; i++){ res[i] = getTFIDF(words[i], document); } return res; } private String[] getWordsQuery(Document query){ String[] words = query.preprocess(); HashSet uniqueWords = new HashSet<>(); for (int i = 0; i < words.length; i++){ if (this.getIDF(words[i]) > 1e-4){ uniqueWords.add(words[i]); } } String[] uniqueWordsArray = uniqueWords.toArray(new String[uniqueWords.size()]); Arrays.sort(uniqueWordsArray); return uniqueWordsArray; } private Document[] getRelevantDocuments(String[] words){ HashSet relevantDocuments = new HashSet<>(); for (int i = 0; i < words.length; i++){ if (inverseIndex.containsKey(words[i])){ for (Document document: inverseIndex.get(words[i])){ relevantDocuments.add(document); } } } return relevantDocuments.toArray(new Document[relevantDocuments.size()]); } public Document search(Document query){ String[] words = this.getWordsQuery(query); Document[] relevantDocuments = this.getRelevantDocuments(words); double[] scores = new double[relevantDocuments.length]; for (int i = 0; i < scores.length; i++){ double[] vector = this.getTFIDFVector(relevantDocuments[i], words); scores[i] = 0; for (int j = 0; j < vector.length; j++) scores[i] += vector[j]; } int maxiIndex = 0; double maxiScore = -1; for (int i = 0; i < scores.length; i++){ if (scores[i]> maxiScore){ maxiIndex = i; maxiScore = scores[i]; } } return relevantDocuments[maxiIndex]; } public static void main(String[] args) { Document[] documents = { Document.fromURL("https://www.gutenberg.org/files/16/16-0.txt"), Document.fromURL("https://www.gutenberg.org/cache/epub/1513/pg1513.txt"), Document.fromURL("https://www.gutenberg.org/files/2701/2701-0.txt"), Document.fromURL("https://www.gutenberg.org/cache/epub/2641/pg2641.txt"), Document.fromURL("https://www.gutenberg.org/cache/epub/145/pg145.txt"), Document.fromURL("https://www.gutenberg.org/cache/epub/37106/pg37106.txt"), Document.fromURL("https://www.gutenberg.org/cache/epub/67979/pg67979.txt"), Document.fromURL("https://www.gutenberg.org/ebooks/16389"), Document.fromURL("https://www.gutenberg.org/cache/epub/6761/pg6761.txt"), Document.fromURL("https://www.gutenberg.org/cache/epub/394/pg394.txt"), Document.fromURL("https://www.gutenberg.org/files/2160/2160-0.txt"), Document.fromURL("https://www.gutenberg.org/files/4085/4085-0.txt"), Document.fromURL("https://www.gutenberg.org/ebooks/1342"), Document.fromURL("https://www.gutenberg.org/ebooks/6593") }; Document query = new Document("pirate island"); SearchEngine tfidf = new SearchEngine(); for (Document document: documents){ tfidf.add(document); } System.out.println(tfidf.getFrequency("the", documents[0])); System.out.println(tfidf.getFrequency("peter", documents[0])); System.out.println(tfidf.getFrequency("spread", documents[0])); System.out.println(tfidf.getIDF("the")); System.out.println(tfidf.getIDF("peter")); System.out.println(tfidf.getIDF("spread")); System.out.println(tfidf.getTFIDF("the", documents[0])); System.out.println(tfidf.getTFIDF("peter", documents[0])); System.out.println(tfidf.getTFIDF("spread", documents[0])); System.out.println(tfidf.search(query)); } }