Counting word occurrences in a docx file

1. Add Apache POI to your pom.xml

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.7-beta3</version>
</dependency>

 

2. Create a function to parse the docx file

package com.amudabadmus.docxcounter;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Extracts the plain text of a .docx document using Apache POI.
 */
public class WordCounter
{
   /**
    * Reads a .docx file and returns its text content.
    *
    * @param fileNameorFilePath name or path of the .docx file to read
    * @return the text produced by {@code XWPFWordExtractor.getText()}
    * @throws RuntimeException wrapping any failure to open or parse the file
    */
   public String parseDOCX(String fileNameorFilePath )
   {
      // try-with-resources releases the file handle even when parsing fails;
      // the text is extracted before the stream is closed.
      try (FileInputStream input = new FileInputStream(fileNameorFilePath))
      {
         XWPFDocument docx = new XWPFDocument(input);
         XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(docx);
         return xwpfWordExtractor.getText();
      }
      catch ( Exception error )
      {
         // Keep the original cause and say which file could not be processed.
         throw new RuntimeException("Unable to parse " + fileNameorFilePath, error);
      }
   }

   /**
    * Prints the text of a .docx file to standard output.
    *
    * @param args optional; {@code args[0]} is the path of the .docx file.
    *             When absent, the sample path from the tutorial is used.
    */
   public static void main (String[] args) throws Exception
   {
      WordCounter wordCounter = new WordCounter();
      String data = args.length > 0
            ? args[0]
            : "C:\\Users\\User\\Desktop\\2\\java4nigeria\\page1.docx";
      System.out.println(wordCounter.parseDOCX(data));
   }
}

 

Docx content

2017-04-21 at 09-19-08.png

 

 

Output

2017-04-21 at 09-23-14.png

 

 

3. Create a function to write the parsed docx file to .txt

package com.amudabadmus.awfa;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
/**
 * Extracts the text of a .docx document using Apache POI and saves it to a
 * plain-text file.
 */
public class WordCounter
{
   /**
    * Reads a .docx file and returns its text content.
    *
    * @param fileNameorFilePath name or path of the .docx file to read
    * @return the text produced by {@code XWPFWordExtractor.getText()}
    * @throws RuntimeException wrapping any failure to open or parse the file
    */
   public String parseDOCX(String fileNameorFilePath )
   {
      // try-with-resources releases the file handle even when parsing fails.
      try (FileInputStream input = new FileInputStream(fileNameorFilePath))
      {
         XWPFDocument docx = new XWPFDocument(input);
         XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(docx);
         return xwpfWordExtractor.getText();
      }
      catch ( Exception error )
      {
         throw new RuntimeException("Unable to parse " + fileNameorFilePath, error);
      }
   }

   /**
    * Writes the extracted text to a file, replacing any existing content.
    *
    * @param newTxtName name or path of the text file to create
    * @param parsedDocx text to store in the file
    * @throws Exception if the file cannot be created or written
    */
   public void writeDocxToTxt(String newTxtName, String parsedDocx) throws Exception
   {
      // Write to the file the caller asked for (previously a fixed name was
      // used and newTxtName only appeared in the message). The writer is
      // closed even if write() throws.
      try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(newTxtName))))
      {
         bw.write(parsedDocx);
      }
      System.out.printf("%s parsed and saved succesfully....", newTxtName);
   }

   /**
    * Parses the sample .docx file and saves its text to the file "Amuda".
    */
   public static void main (String[] args) throws Exception
   {
      WordCounter wordCounter = new WordCounter();
      String data = "C:\\Users\\User\\Desktop\\2\\java4nigeria\\page1.docx";
      String newTxt = "Amuda";
      // Pass the path variable, not the string literal "data".
      wordCounter.writeDocxToTxt(newTxt, wordCounter.parseDOCX(data));
   }
}

 

Output…

2017-04-21 at 09-28-06.png

4. Create a lambda function to count the occurrences of each word in the parsed docx.

package com.amudabadmus.awfa;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.nio.file.*;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Extracts the text of a .docx document using Apache POI, saves it to a
 * plain-text file and prints how often each word occurs in that file.
 */
public class WordCounter
{
   /** Separator between words; compiled once because it never changes. */
   private static final Pattern WHITESPACE = Pattern.compile("\\s+");

   /**
    * Reads a .docx file and returns its text content.
    *
    * @param fileNameorFilePath name or path of the .docx file to read
    * @return the text produced by {@code XWPFWordExtractor.getText()}
    * @throws RuntimeException wrapping any failure to open or parse the file
    */
   public String parseDOCX(String fileNameorFilePath )
   {
      // try-with-resources releases the file handle even when parsing fails.
      try (FileInputStream input = new FileInputStream(fileNameorFilePath))
      {
         XWPFDocument docx = new XWPFDocument(input);
         XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(docx);
         return xwpfWordExtractor.getText();
      }
      catch ( Exception error )
      {
         throw new RuntimeException("Unable to parse " + fileNameorFilePath, error);
      }
   }

   /**
    * Writes the extracted text to a file, replacing any existing content.
    *
    * @param newTxtName name or path of the text file to create
    * @param parsedDocx text to store in the file
    * @throws Exception if the file cannot be created or written
    */
   public void writeDocxToTxt(String newTxtName, String parsedDocx) throws Exception
   {
      // Files.newBufferedWriter encodes as UTF-8, which is what Files.lines
      // in analyzeDOCX decodes; a platform-charset writer would make
      // non-ASCII text unreadable there. The writer is closed even if
      // write() throws.
      try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(newTxtName)))
      {
         bw.write(parsedDocx);
      }
      System.out.printf("%s parsed and saved succesfully....", newTxtName);
   }

   /**
    * Counts the words in a text file, ignoring case, and prints them in
    * alphabetical order grouped under their first character.
    *
    * Punctuation other than the apostrophe is removed before the text is
    * split on whitespace.
    *
    * @param fileSource name or path of the text file to analyse
    * @throws Exception if the file cannot be read
    */
   public void analyzeDOCX( String fileSource ) throws Exception
   {
      Map<String, Long> wordCounts;
      // Files.lines keeps the file open until the stream is closed.
      try (java.util.stream.Stream<String> lines = Files.lines(Paths.get(fileSource)))
      {
         wordCounts = lines
               .map(line -> line.replaceAll("(?!')\\p{P}", ""))
               .flatMap(WHITESPACE::splitAsStream)
               // Blank lines and leading whitespace yield empty tokens;
               // without this filter charAt(0) below would throw.
               .filter(word -> !word.isEmpty())
               .collect(Collectors.groupingBy(String::toLowerCase,
                     TreeMap::new, Collectors.counting()));
      }

      wordCounts.entrySet()
            .stream()
            .collect(
                  Collectors.groupingBy(entry -> entry.getKey().charAt(0),
                        TreeMap::new, Collectors.toList()))
            .forEach((letter, wordList) ->
            {
               System.out.printf("%n%C%n", letter);
               wordList.forEach(word -> System.out.printf(
                     "%13s: %d%n", word.getKey(), word.getValue()));
            });
   }

   /**
    * Runs the whole pipeline on the sample document: parse the .docx, save
    * its text to the file "Amuda", then print the word counts of that file.
    */
   public static void main (String[] args) throws Exception
   {
      WordCounter wordCounter = new WordCounter();
      String data = "C:\\Users\\User\\Desktop\\2\\java4nigeria\\page1.docx";
      String newTxt = "Amuda";
      // Create the text file first so the analysis does not depend on a
      // file left behind by an earlier run.
      wordCounter.writeDocxToTxt(newTxt, wordCounter.parseDOCX(data));
      wordCounter.analyzeDOCX(newTxt);
   }
}

 

Output

2017-04-21 at 09-32-36

I hope this makes your day better. For more information, check out the source code

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Create a free website or blog at WordPress.com.

Up ↑

%d bloggers like this: