BooleanQueryWordnet.java
BooleanQueryWordnet.java
— 6.5 KB
Dateiinhalt
// DO NOT CHANGE THIS PACKAGE NAME. package ue_inforet_bool_wordnet_study; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; public class BooleanQueryWordnet { /** * DO NOT ADD ADDITIONAL PARAMETERS TO THE SIGNATURE * OF THE CONSTRUCTOR. * */ public BooleanQueryWordnet() { // TODO you may insert code here } /** * A method for parsing the WortNet synsets. * The data.[noun, verb, adj, adv] files contain the synsets. * The [noun, verb, adj, adv].exc files contain the base forms * of irregular words. * * Please refer to * http://wordnet.princeton.edu/man/wndb.5WN.html * regarding the syntax of these plain files. * * DO NOT CHANGE THIS METHOD'S INTERFACE. * * @param wordnetDir the directory of the wordnet files */ public void buildSynsets(String wordnetDir) { // TODO: insert code here } /** * A method for reading the textual movie plot file and building a Lucene index. * The purpose of the index is to speed up subsequent boolean searches using * the {@link #booleanQuery(String) booleanQuery} method. * * DO NOT CHANGE THIS METHOD'S INTERFACE. * * @param plotFile * the textual movie plot file 'plot.list', obtainable from <a * href="http://www.imdb.com/interfaces" * >http://www.imdb.com/interfaces</a> for personal, non-commercial * use. */ public void buildIndices(String plotFile) { // TODO: insert code here } /** * A method for performing a boolean search on a textual movie plot file after * Lucene indices were built using the {@link #buildIndices(String) buildIndices} * method. The movie plot file contains entries of the <b>types</b> movie, * series, episode, television, video, and videogame. This method allows queries * following the Lucene query syntax on any of the <b>fields</b> title, plot, year, * episode, and type. Note that queries are case-insensitive and stop words are * removed.<br> * <br> * * More details on the query syntax can be found at <a * href="http://www.lucenetutorial.com/lucene-query-syntax.html"> * http://www.lucenetutorial.com/lucene-query-syntax.html</a>. * * DO NOT CHANGE THIS METHOD'S INTERFACE. * * @param queryString * the query string, formatted according to the Lucene query syntax. * @return the exact content (in the textual movie plot file) of the title * lines (starting with "MV: ") of the documents matching the query */ public Set<String> booleanQuery(String queryString) { // TODO: insert code here return new HashSet<>(); } /** * A method for closing any open file handels or a ThreadPool. * * DO NOT CHANGE THIS METHOD'S INTERFACE. */ public void close() { // TODO: you may insert code here } public static void main(String[] args) { if (args.length < 4) { System.err .println("usage: java -jar BooleanQueryWordnet.jar <plot list file> <wordnet directory> <queries file> <results file>"); System.exit(-1); } // build indices System.out.println("building indices..."); long tic = System.nanoTime(); Runtime runtime = Runtime.getRuntime(); long mem = runtime.totalMemory(); // the directory to the wordnet-files: [*.exc], [data.*] String plotFile = args[0]; String wordNetDir = args[1]; BooleanQueryWordnet bq = new BooleanQueryWordnet(); bq.buildSynsets(wordNetDir); bq.buildIndices(plotFile); System.gc(); try { Thread.sleep(10); } catch (InterruptedException e1) { e1.printStackTrace(); } System.out .println("runtime: " + (System.nanoTime() - tic) + " nanoseconds"); System.out .println("memory: " + ((runtime.totalMemory() - mem) / (1048576l)) + " MB (rough estimate)"); // parsing the queries that are to be run from the queries file List<String> queries = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(args[2]), StandardCharsets.ISO_8859_1))) { String line; while ((line = reader.readLine()) != null) queries.add(line); } catch (IOException e) { e.printStackTrace(); System.exit(-1); } // parsing the queries' expected results from the results file List<Set<String>> results = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(args[3]), StandardCharsets.ISO_8859_1))) { String line; while ((line = reader.readLine()) != null) { Set<String> result = new HashSet<>(); results.add(result); for (int i = 0; i < Integer.parseInt(line); i++) { result.add(reader.readLine()); } } } catch (IOException e) { e.printStackTrace(); System.exit(-1); } // run queries for (int i = 0; i < queries.size(); i++) { String query = queries.get(i); Set<String> expectedResult = i < results.size() ? results.get(i) : new HashSet<>(); System.out.println(); System.out.println("query: " + query); tic = System.nanoTime(); Set<String> actualResult = bq.booleanQuery(query); // sort expected and determined results for human readability List<String> expectedResultSorted = new ArrayList<>(expectedResult); List<String> actualResultSorted = new ArrayList<>(actualResult); Comparator<String> stringComparator = new Comparator<String>() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; expectedResultSorted.sort(stringComparator); actualResultSorted.sort(stringComparator); System.out.println("runtime: " + (System.nanoTime() - tic) + " nanoseconds."); System.out.println("expected result (" + expectedResultSorted.size() + "): " + expectedResultSorted.toString()); System.out.println("actual result (" + actualResultSorted.size() + "): " + actualResultSorted.toString()); System.out.println(expectedResult.equals(actualResult) ? "SUCCESS" : "FAILURE"); } bq.close(); } }