<?php
/**
* Copyright 2011 Marc Ermshaus
*
* This code is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This code is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this code. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * A single text, reduced to the set of word tokens used for comparison.
 *
 * The text is lower-cased, split into runs of Unicode letters that are at
 * least getMinimumTokenLength() characters long, and stripped of stopwords.
 * The resulting tokens are available through getTokens().
 *
 * @version 2011-Feb-13
 * @author Marc Ermshaus <http://www.ermshaus.org/>
 */
class TextComparator_Item
{
    /** Flag: treat the $input of loadText() as a path and read that file */
    const FROM_FILE = 0x01;

    /** Flag: pass the text through strip_tags() before tokenizing */
    const STRIP_HTML = 0x02;

    /** @var array Stopwords stored as array keys; the values are never read */
    protected $stopwords = array();

    /** @var int Minimum number of letters a word needs to become a token */
    protected $minimumTokenLength = 3;

    /** @var array Token => count, rebuilt by every call to loadText() */
    protected $tokens = array();

    /**
     * Loads a text and replaces the current tokens with the ones found in it.
     *
     * @param string $input Path to read (with FROM_FILE) or the text itself
     * @param int    $flags Bitmask of FROM_FILE and STRIP_HTML; null applies
     *                      both flags
     * @throws RuntimeException If FROM_FILE is set and the file cannot be
     *                          read, or if tokenizing fails
     */
    public function loadText($input, $flags = null)
    {
        if ($flags === null) {
            $flags = self::FROM_FILE | self::STRIP_HTML;
        }

        if ($flags & self::FROM_FILE) {
            $path  = $input;
            $input = file_get_contents($path);

            // Without this check the boolean false would be tokenized like an
            // empty text and the read error would go unnoticed
            if ($input === false) {
                throw new RuntimeException(
                    'Unable to read text from "' . $path . '"'
                );
            }
        }

        if ($flags & self::STRIP_HTML) {
            $input = strip_tags($input);
        }

        $this->tokenize($input);
        $this->groupByToken();
    }

    /**
     * Extracts the distinct words of $input into $this->tokens.
     *
     * After this method, $this->tokens is keyed by token; the values are left
     * over from array_flip() and carry no meaning until groupByToken() runs.
     *
     * @param string $input
     * @throws RuntimeException If the regular expression engine reports an
     *                          error
     */
    protected function tokenize($input)
    {
        // Everything consisting of x+ letters shall be indexed as a word
        $input = mb_strtolower($input);

        $found = preg_match_all(
            '/\p{L}{' . $this->minimumTokenLength . ',}/u',
            $input,
            $matches
        );

        if ($found === false) {
            throw new RuntimeException(
                'Tokenizing failed (PCRE error code ' . preg_last_error() . ')'
            );
        }

        $words = $matches[0];

        // A text without any qualifying word has always produced exactly one
        // empty-string token. This is kept as is so that such a text still
        // has a non-zero token count, which TextComparator::compare() uses as
        // a divisor.
        if (count($words) === 0) {
            $words = array('');
        }

        // Flipping turns the words into keys, which de-duplicates them and
        // allows the isset() lookups below
        $this->tokens = array_flip($words);

        // Remove stopwords
        foreach ($this->tokens as $token => $unused) {
            if (isset($this->stopwords[$token])) {
                unset($this->tokens[$token]);
            }
        }
    }

    /**
     * Turns the keyed token set built by tokenize() into a token => count map.
     *
     * tokenize() has already collapsed repeated words into unique array keys,
     * so every count is 1.
     *
     * NOTE(review): if real word frequencies were intended, tokenize() would
     * have to keep duplicates. Confirm before changing, because the scores of
     * TextComparator::compare() are built from these counts.
     */
    protected function groupByToken()
    {
        $this->tokens = array_fill_keys(array_keys($this->tokens), 1);
    }

    /**
     * Returns the stopwords in the form they were set.
     *
     * @return array
     */
    public function getStopwords()
    {
        return $this->stopwords;
    }

    /**
     * Sets the stopwords to drop while tokenizing.
     *
     * @param array $stopwords Stopwords as array KEYS (e.g. the result of
     *                         array_flip() on a word list); they are looked
     *                         up with isset()
     */
    public function setStopwords(array $stopwords)
    {
        $this->stopwords = $stopwords;
    }

    /**
     * Returns the minimum number of letters a word needs to become a token.
     *
     * @return int
     */
    public function getMinimumTokenLength()
    {
        return $this->minimumTokenLength;
    }

    /**
     * Sets the minimum number of letters a word needs to become a token.
     *
     * The new value only affects later loadText() calls; tokens that were
     * already extracted stay untouched.
     *
     * @param int $minimumTokenLength Cast to int; must be 1 or greater
     * @throws InvalidArgumentException If the value is smaller than 1
     */
    public function setMinimumTokenLength($minimumTokenLength)
    {
        $minimumTokenLength = (int) $minimumTokenLength;

        // The value is interpolated into the tokenizer pattern as the lower
        // bound of a quantifier; zero or negative numbers would yield a
        // pattern that no longer means "N or more letters"
        if ($minimumTokenLength < 1) {
            throw new InvalidArgumentException(
                'Minimum token length must be at least 1'
            );
        }

        $this->minimumTokenLength = $minimumTokenLength;
    }

    /**
     * Returns the tokens of the most recently loaded text.
     *
     * @return array Token => count; empty until loadText() has been called
     */
    public function getTokens()
    {
        return $this->tokens;
    }
}
/**
 * Scores how much of one text's vocabulary also occurs in another text and
 * remembers every score it has calculated.
 *
 * @version 2011-Feb-13
 * @author Marc Ermshaus <http://www.ermshaus.org/>
 */
class TextComparator
{
    /** @var array Stopwords handed to every item built by createNewItem() */
    protected $stopwords = array();

    /**
     * @var array Cached scores. Each entry is an array with the keys
     *            'original', 'compareTo' and 'points' and is stored under a
     *            key derived from the object hashes of both items
     */
    protected $pointCache = array();

    /**
     * @param array $stopwords Stopwords as array keys, passed on unchanged to
     *                         TextComparator_Item::setStopwords()
     */
    public function __construct(array $stopwords = array())
    {
        $this->stopwords = $stopwords;
    }

    /**
     * Scores how many tokens of $original are also present in $compareTo.
     *
     * For every token of $original that $compareTo contains as well, the
     * count stored for it in $compareTo is added up. The sum is multiplied
     * by 1000, divided by the number of tokens in $original and rounded.
     * The score is therefore not symmetric: compare($a, $b) may differ from
     * compare($b, $a).
     *
     * Scores are cached per pair of item INSTANCES. An item whose text is
     * reloaded after it has been compared keeps returning the cached score.
     *
     * @param TextComparator_Item $original
     * @param TextComparator_Item $compareTo
     * @return int Score; 0 if $original has no tokens at all
     */
    public function compare(TextComparator_Item $original,
        TextComparator_Item $compareTo
    ) {
        // Keyed lookup instead of scanning every cached pair. The cache entry
        // keeps both objects alive, so their hashes cannot be handed out to
        // other objects while the entry exists.
        $cacheKey = spl_object_hash($original) . ':'
                  . spl_object_hash($compareTo);

        if (isset($this->pointCache[$cacheKey])) {
            $cached = $this->pointCache[$cacheKey];

            if ($cached['original'] === $original
                && $cached['compareTo'] === $compareTo
            ) {
                return $cached['points'];
            }
        }

        $originalTokens  = $original->getTokens();
        $compareToTokens = $compareTo->getTokens();
        $tokenCount      = count($originalTokens);

        $points = 0;

        // An item without tokens (never loaded, or consisting of stopwords
        // only) shares nothing with any text; skipping the calculation also
        // avoids a division by zero
        if ($tokenCount > 0) {
            $shared = array_intersect_key($compareToTokens, $originalTokens);
            $points = (int) round(array_sum($shared) * 1000 / $tokenCount);
        }

        $this->pointCache[$cacheKey] = array(
            'original'  => $original,
            'compareTo' => $compareTo,
            'points'    => $points
        );

        return $points;
    }

    /**
     * Factory method to return a TextComparator_Item instance with sane
     * defaults set: the item receives this comparator's stopwords before the
     * text is loaded.
     *
     * @param string $input Passed to TextComparator_Item::loadText()
     * @param int    $flags Combination of TextComparator_Item flags
     * @return TextComparator_Item
     */
    public function createNewItem($input, $flags = null)
    {
        $item = new TextComparator_Item();
        $item->setStopwords($this->stopwords);
        $item->loadText($input, $flags);

        return $item;
    }
}
// Demo script: loads every ./texts/*.txt file, compares all of them with each
// other and prints a score matrix plus the three most similar texts per file.

// Setup
error_reporting(-1);
mb_internal_encoding('UTF-8');
header('Content-Type: text/html; charset=UTF-8');

// Load stopwords. We are using hashed arrays for all expensive calculations
// as they allow for fast lookup (O(1)). This should be a classic time-memory
// tradeoff
$stopwordLines = file('./data/stopwords.txt',
    FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

// file() returns false when the list cannot be read; fail with a clear
// message instead of passing false on to array_flip()
if ($stopwordLines === false) {
    throw new RuntimeException('Unable to read ./data/stopwords.txt');
}

$stopwords = array_flip($stopwordLines);

// Initialize comparator instance
$tc = new TextComparator($stopwords);

// Holds all loaded texts, keyed by file name without extension
$allTexts = array();

// Load and tokenize all texts in subdirectory. glob() may return false
// instead of an empty array, which is treated as "no texts" here
$texts = glob('./texts/*.txt');

if ($texts === false) {
    $texts = array();
}

$matrix = array();

foreach ($texts as $file) {
    $allTexts[pathinfo($file, PATHINFO_FILENAME)] = $tc->createNewItem($file);
}

// Compare all texts in subdirectory with each other
foreach ($allTexts as $file1 => $t1) {
    foreach ($allTexts as $file2 => $t2) {
        if ($file1 !== $file2) {
            $matrix[$file1][$file2] = $tc->compare($t1, $t2);
        } else {
            $matrix[$file1][$file2] = '-';
        }
    }
}

// Render the results. File names end up in HTML, so they are escaped. The
// names are printed exactly as used for the rows; stripping the "extension"
// a second time would shorten names that contain a dot.
echo '<table border="1">';
echo '<tr>';
echo '<th>Text/CompareTo</th>';

foreach (array_keys($allTexts) as $name) {
    echo '<th style="font-size: 10px;font-weight:normal;width: 60px;">'
        . htmlspecialchars((string) $name, ENT_QUOTES, 'UTF-8') . '</th>';
}

echo '</tr>';

foreach ($matrix as $file1 => $cols) {
    echo '<tr>';
    echo '<th>' . htmlspecialchars((string) $file1, ENT_QUOTES, 'UTF-8')
        . '</th>';

    foreach ($cols as $file2 => $score) {
        // $score is either an integer or the literal '-'
        echo '<td style="width: 60px;">' . $score . '</td>';
    }

    echo '</tr>';
}

echo '</table>';

if (function_exists('xdebug_time_index')) {
    echo '<p>Current execution time: ' . round(xdebug_time_index(), 2) . ' s</p>';
}

// Show top 3 similar texts for each text. The scores were all calculated for
// the matrix above, so its rows are reused here.
foreach ($matrix as $file1 => $cols) {
    $scores = $cols;
    unset($scores[$file1]);

    arsort($scores);

    // preserve_keys = true: purely numeric file names are integer keys, which
    // array_slice() would otherwise renumber to 0, 1, 2
    $top = array_slice($scores, 0, 3, true);

    echo '<p>Similar texts for '
        . htmlspecialchars((string) $file1, ENT_QUOTES, 'UTF-8') . ':</p>';
    echo '<ol>';

    foreach ($top as $file2 => $score) {
        echo '<li>' . htmlspecialchars((string) $file2, ENT_QUOTES, 'UTF-8')
            . ' (' . $score . ')</li>';
    }

    echo '</ol>';
}

if (function_exists('xdebug_time_index')) {
    echo '<p>Current execution time: ' . round(xdebug_time_index(), 2) . ' s</p>';
}