// WordCounterImpl.java

/*
 * Copyright 2010-2019 James Pether Sörling
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *	$Id$
 *  $HeadURL$
*/
package com.hack23.cia.service.impl.action.user.wordcount;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.springframework.stereotype.Service;

import com.hack23.cia.model.external.riksdagen.documentcontent.impl.DocumentContentData;

import smile.nlp.SimpleCorpus;
import smile.nlp.dictionary.EnglishPunctuations;
import smile.nlp.tokenizer.SimpleSentenceSplitter;
import smile.nlp.tokenizer.SimpleTokenizer;

/**
 * The Class WordCounterImpl.
 *
 * Counts how often each term occurs in the content of a single document,
 * using a Smile {@link SimpleCorpus} for tokenization and term statistics.
 */
@Service
final class WordCounterImpl implements WordCounter {

	/**
	 * Instantiates a new word counter impl.
	 */
	public WordCounterImpl() {
		super();
	}

	/**
	 * Calculates the term frequencies for the content of one document.
	 *
	 * The content is cleaned with {@link Jsoup#clean(String, Whitelist)} and
	 * then added to a corpus configured with {@link SwedishStopWords} and
	 * {@link EnglishPunctuations}; every term the corpus reports is returned
	 * together with its frequency.
	 *
	 * @param documentContentData
	 *            the document whose content is analysed; its id identifies the
	 *            text inside the corpus
	 * @param maxResult
	 *            the maximum number of terms to return; a value of zero or
	 *            less means "no limit"
	 * @return a mutable map from term to frequency. When the limit applies the
	 *         map holds the {@code maxResult} most frequent terms, iterated in
	 *         descending frequency order (ties broken alphabetically). The map
	 *         is empty when the document has no content.
	 */
	public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {

		final String html = documentContentData.getContent();

		// Nothing to count when the document carries no content at all.
		if (html == null) {
			return new HashMap<>();
		}

		final SimpleCorpus simpleCorpus = new SimpleCorpus(SimpleSentenceSplitter.getInstance(), new SimpleTokenizer(),
				new SwedishStopWords(), EnglishPunctuations.getInstance());

		// NOTE(review): Whitelist.basic() still allows some markup through, so
		// the cleaned text may contain tags - confirm that is intended before
		// the text is tokenized.
		final String cleanedContent = Jsoup.clean(html, Whitelist.basic());

		// The document id is passed for both leading arguments of add(..).
		simpleCorpus.add(documentContentData.getId(), documentContentData.getId(), cleanedContent);

		final Map<String, Integer> frequencies = new HashMap<>();
		final Iterator<String> terms = simpleCorpus.getTerms();
		while (terms.hasNext()) {
			final String term = terms.next();
			frequencies.put(term, simpleCorpus.getTermFrequency(term));
		}

		return limitToMostFrequent(frequencies, maxResult);
	}

	/**
	 * Keeps only the most frequent terms of a frequency map.
	 *
	 * @param frequencies
	 *            the complete term to frequency map
	 * @param maxResult
	 *            the maximum number of entries to keep; zero or less disables
	 *            the limit
	 * @return {@code frequencies} itself when no trimming is needed, otherwise
	 *         a new insertion-ordered map with the {@code maxResult} most
	 *         frequent terms
	 */
	private static Map<String, Integer> limitToMostFrequent(final Map<String, Integer> frequencies,
			final int maxResult) {
		if (maxResult <= 0 || frequencies.size() <= maxResult) {
			return frequencies;
		}

		final List<Map.Entry<String, Integer>> entries = new ArrayList<>(frequencies.entrySet());
		Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
			@Override
			public int compare(final Map.Entry<String, Integer> left, final Map.Entry<String, Integer> right) {
				// Highest frequency first; alphabetical order makes ties deterministic.
				final int byFrequency = right.getValue().compareTo(left.getValue());
				return byFrequency != 0 ? byFrequency : left.getKey().compareTo(right.getKey());
			}
		});

		final Map<String, Integer> limited = new LinkedHashMap<>();
		for (final Map.Entry<String, Integer> entry : entries.subList(0, maxResult)) {
			limited.put(entry.getKey(), entry.getValue());
		}
		return limited;
	}

}