We will extract distinct words from a given file using Java.
Concepts
The Set data structure does not allow duplicate elements, so it can be used to filter out duplicate words.
A text file can be split into words with a regex; alternatively, Java provides the StringTokenizer class, which splits each line of the file on a given set of delimiter characters.
We need to close every input file to avoid file handle leaks in a Java program. The try-with-resources statement automatically closes the underlying input stream once the block of code has executed.
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Extracts the distinct, lower-cased words found in a text file.
 *
 * <p>Each line is split on whitespace and common punctuation with a
 * {@link StringTokenizer}; the resulting tokens are lower-cased and collected
 * into a {@link Set}, which discards duplicates.
 */
public class DistinctWords {
    private static final Logger LOGGER = Logger.getLogger(DistinctWords.class.getName());

    /**
     * Characters that separate words. Includes tab and other whitespace as well as
     * common punctuation so that "hello\tworld" yields two words and "world!" is
     * not treated as a different word from "world".
     */
    private static final String DELIMITERS = " \t\r\f,.;:\"!?()";

    /**
     * Reads the given file as UTF-8 text and returns its distinct words in lower case.
     *
     * <p>If the file cannot be opened or read, the error is logged at
     * {@link Level#SEVERE} and the words collected so far (possibly none) are returned.
     *
     * @param fileName path of the text file to read
     * @return a mutable set of the distinct lower-cased words; never {@code null}
     */
    public Set<String> getDistinctWords(String fileName) {
        Set<String> wordSet = new HashSet<>();
        // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(line, DELIMITERS);
                while (st.hasMoreTokens()) {
                    // Locale.ROOT keeps case folding independent of the default locale
                    // (e.g. under a Turkish locale "I" would not lower-case to "i").
                    wordSet.add(st.nextToken().toLowerCase(Locale.ROOT));
                }
            }
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "IOException occurred", e);
        }
        return wordSet;
    }

    /**
     * Prints every distinct word of a file, one per line.
     *
     * @param args optional; {@code args[0]} is the path of the file to read. When no
     *             argument is given the placeholder path {@code "<path-to-file>"} is used.
     */
    public static void main(String[] args) {
        String fileName = args.length > 0 ? args[0] : "<path-to-file>";
        DistinctWords distinctFileWords = new DistinctWords();
        Set<String> wordList = distinctFileWords.getDistinctWords(fileName);
        for (String str : wordList) {
            System.out.println(str);
        }
    }
}
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.