<?php

namespace SearchWP\Dependencies\Wamania\Snowball;

/**
 * English Porter 2
 *
 * @link http://snowball.tartarus.org/algorithms/english/stemmer.html
 * @author wamania
 *
 */
class English extends Stem
{
    /**
     * Letters treated as vowels. Lowercase "y" is a vowel; the uppercase "Y"
     * that stem() uses to mark a consonant "y" is deliberately absent.
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
    /**
     * Doubled consonants that step 1b shortens to a single letter.
     */
    protected static $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
    /**
     * Letters that must precede a final "li" for step 2 to delete it.
     */
    protected static $liEnding = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
    /**
     * Step 2 rules, tried in this order: array(list of suffixes, replacement).
     * Longer suffixes are listed before the shorter suffixes they end with
     * (e.g. "ization" and "ational" before "ation", "ational" before "tional"),
     * so the first rule that matches is also the longest match.
     */
    private static $step2Rules = array(
        array(array('iveness', 'iviti'), 'ive'),
        array(array('ousli', 'ousness'), 'ous'),
        array(array('izer', 'ization'), 'ize'),
        array(array('ational', 'ation', 'ator'), 'ate'),
        array(array('biliti', 'bli'), 'ble'),
        array(array('lessli'), 'less'),
        array(array('fulness', 'fulli'), 'ful'),
        array(array('tional'), 'tion'),
        array(array('alism', 'aliti', 'alli'), 'al'),
        array(array('enci'), 'ence'),
        array(array('anci'), 'ance'),
        array(array('abli'), 'able'),
        array(array('entli'), 'ent'),
    );
    /**
     * Step 3 rules, tried in this order: array(list of suffixes, replacement).
     * An empty replacement deletes the suffix.
     */
    private static $step3Rules = array(
        array(array('ational'), 'ate'),
        array(array('tional'), 'tion'),
        array(array('alize'), 'al'),
        array(array('icate', 'iciti', 'ical'), 'ic'),
        array(array('ful', 'ness'), ''),
    );
    /**
     * Stem an English word.
     *
     * Words shorter than three characters are returned exactly as given
     * (not lowercased). Every other word is lowercased, checked against the
     * first exception list, and then run through steps 0 to 5.
     *
     * @param string $word UTF-8 encoded word
     *
     * @return string the stem
     *
     * @throws \Exception when $word is not valid UTF-8
     */
    public function stem($word)
    {
        // we do ALL in UTF-8
        if (!Utf8::check($word)) {
            throw new \Exception('Word must be in UTF-8');
        }
        if (Utf8::strlen($word) < 3) {
            return $word;
        }
        $this->word = Utf8::strtolower($word);
        // exceptions
        $exception = $this->exception1();
        if (null !== $exception) {
            return $exception;
        }
        $this->plainVowels = \implode('', self::$vowels);
        // Remove initial ', if present.
        if (Utf8::substr($this->word, 0, 1) === "'") {
            $this->word = Utf8::substr($this->word, 1);
        }
        // Set initial y, or y after a vowel, to Y.
        // The initial letter is tested AFTER the apostrophe has been stripped, so
        // that "'yes" is prepared exactly like "yes" (the pattern is a no-op when
        // the word does not start with y).
        $this->word = \preg_replace('#^y#u', 'Y', $this->word);
        $this->word = \preg_replace('#([' . $this->plainVowels . '])y#u', '$1Y', $this->word);
        // Regions: R1 first, then its special-prefix override, then R2.
        // r1() and r2() are not defined in this class (presumably inherited from Stem).
        $this->r1();
        $this->exceptionR1();
        $this->r2();
        $this->step0();
        $this->step1a();
        // exceptions 2
        $exception = $this->exception2();
        if (null !== $exception) {
            return $exception;
        }
        $this->step1b();
        $this->step1c();
        $this->step2();
        $this->step3();
        $this->step4();
        $this->step5();
        $this->finish();
        return $this->word;
    }
    /**
     * Step 0
     * Remove a trailing ', 's or 's' (longest first).
     */
    private function step0()
    {
        if (($position = $this->search(array("'s'", "'s", "'"))) !== \false) {
            $this->replaceTail($position, '');
        }
    }
    /**
     * Step 1a
     * Handle the plural-like endings sses, ied, ies, us, ss and s.
     *
     * @return boolean true when one of the endings was found (even if the word was left unchanged)
     */
    private function step1a()
    {
        // sses
        //      replace by ss
        if (($position = $this->search(array('sses'))) !== \false) {
            $this->replaceTail($position, 'ss');
            return \true;
        }
        // ied+   ies*
        //      replace by i if preceded by more than one letter, otherwise by ie (so ties -> tie, cries -> cri)
        if (($position = $this->search(array('ied', 'ies'))) !== \false) {
            $this->replaceTail($position, $position > 1 ? 'i' : 'ie');
            return \true;
        }
        // us+   ss
        //  do nothing
        if ($this->search(array('us', 'ss')) !== \false) {
            return \true;
        }
        // s
        //      delete if the preceding word part contains a vowel not immediately before the s
        //      (so gas and this retain the s, gaps and kiwis lose it)
        if (($position = $this->search(array('s'))) !== \false) {
            // $position - 1 excludes the letter directly in front of the s from the scan
            if ($this->containsVowelBefore($position - 1)) {
                $this->replaceTail($position, '');
            }
            return \true;
        }
        return \false;
    }
    /**
     * Step 1b
     * Handle eed/eedly and ed/edly/ing/ingly.
     *
     * @return boolean true when one of the endings was found (even if the word was left unchanged)
     */
    private function step1b()
    {
        // eed   eedly+
        //      replace by ee if in R1
        if (($position = $this->search(array('eedly', 'eed'))) !== \false) {
            if ($this->inR1($position)) {
                $this->replaceTail($position, 'ee');
            }
            return \true;
        }
        // ed   edly+   ing   ingly+
        //      delete if the preceding word part contains a vowel, and after the deletion:
        //      if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
        //      if the word ends with a double remove the last letter (so hopp -> hop), or
        //      if the word is short, add e (so hop -> hope)
        if (($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== \false) {
            if ($this->containsVowelBefore($position)) {
                $this->replaceTail($position, '');
                if ($this->search(array('at', 'bl', 'iz')) !== \false) {
                    $this->word .= 'e';
                } elseif (($double = $this->search(self::$doubles)) !== \false) {
                    // keep the first letter of the double, drop the second
                    $this->word = Utf8::substr($this->word, 0, $double + 1);
                } elseif ($this->isShort()) {
                    $this->word .= 'e';
                }
            }
            return \true;
        }
        return \false;
    }
    /**
     * Step 1c
     * Replace suffix y or Y by i if preceded by a non-vowel which is not the
     * first letter of the word (so cry -> cri, by -> by, say -> say).
     *
     * @return boolean true when the word is too short or ends in y/Y, false otherwise
     */
    private function step1c()
    {
        // With fewer than three letters the letter before the y would be the first letter.
        if (Utf8::strlen($this->word) < 3) {
            return \true;
        }
        if (($position = $this->search(array('y', 'Y'))) !== \false) {
            $previous = Utf8::substr($this->word, $position - 1, 1);
            if (!\in_array($previous, self::$vowels, \true)) {
                $this->replaceTail($position, 'i');
            }
            return \true;
        }
        return \false;
    }
    /**
     * Step 2
     * Search for the longest among the step 2 suffixes and, if found and in R1,
     * perform the action indicated. Only the first suffix found is considered.
     *
     * @return boolean true when one of the suffixes was found (even if the word was left unchanged)
     */
    private function step2()
    {
        // plain "replace suffix by X" rules
        foreach (self::$step2Rules as $rule) {
            if (($position = $this->search($rule[0])) !== \false) {
                if ($this->inR1($position)) {
                    $this->replaceTail($position, $rule[1]);
                }
                return \true;
            }
        }
        // ogi+:   replace by og if preceded by l
        if (($position = $this->search(array('ogi'))) !== \false) {
            if ($this->inR1($position) && Utf8::substr($this->word, $position - 1, 1) === 'l') {
                $this->replaceTail($position, 'og');
            }
            return \true;
        }
        // li+:   delete if preceded by a valid li-ending
        if (($position = $this->search(array('li'))) !== \false) {
            if ($this->inR1($position)
                && \in_array(Utf8::substr($this->word, $position - 1, 1), self::$liEnding, \true)
            ) {
                $this->replaceTail($position, '');
            }
            return \true;
        }
        return \false;
    }
    /**
     * Step 3
     * Search for the longest among the step 3 suffixes and, if found and in R1,
     * perform the action indicated.
     *
     * @return boolean true when the word was changed, false otherwise
     */
    public function step3()
    {
        // ational -> ate, tional -> tion, alize -> al, icate/iciti/ical -> ic, ful/ness -> deleted
        foreach (self::$step3Rules as $rule) {
            if (($position = $this->searchIfInR1($rule[0])) !== \false) {
                $this->replaceTail($position, $rule[1]);
                return \true;
            }
        }
        // ative*:   delete if in R2
        $position = $this->searchIfInR1(array('ative'));
        if ($position !== \false && $this->inR2($position)) {
            $this->replaceTail($position, '');
            return \true;
        }
        return \false;
    }
    /**
     * Step 4
     * Search for the longest among the step 4 suffixes and, if found and in R2,
     * delete it.
     *
     * @return boolean true when one of the suffixes was found (even if the word was left unchanged)
     */
    public function step4()
    {
        //    ement  ance   ence  able ible   ant  ment   ent   ism   ate   iti   ous   ive   ize al  er   ic
        //      delete
        // "ement" is listed before "ment" and "ent" so that the longest suffix wins.
        $suffixes = array(
            'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
            'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic',
        );
        if (($position = $this->search($suffixes)) !== \false) {
            if ($this->inR2($position)) {
                $this->replaceTail($position, '');
            }
            return \true;
        }
        // ion
        //      delete if preceded by s or t
        if (($position = $this->searchIfInR2(array('ion'))) !== \false) {
            $previous = Utf8::substr($this->word, $position - 1, 1);
            if ($previous === 's' || $previous === 't') {
                $this->replaceTail($position, '');
            }
            return \true;
        }
        return \false;
    }
    /**
     * Step 5
     * Search for a final e or l and, if found, perform the action indicated.
     *
     * @return boolean true when a final e, or a final l in R2, was found (even if the word was left unchanged)
     */
    public function step5()
    {
        // e
        //      delete if in R2, or in R1 and not preceded by a short syllable
        if (($position = $this->search(array('e'))) !== \false) {
            if ($this->inR2($position)) {
                $this->replaceTail($position, '');
            } elseif ($this->inR1($position)) {
                // The windows end just before the final e: three letters for a
                // consonant-vowel-consonant syllable, two letters for a
                // vowel-consonant syllable at the start of the word.
                $afterShortSyllable = $this->searchShortSyllabe(-4, 3) || $this->searchShortSyllabe(-3, 2);
                if (!$afterShortSyllable) {
                    $this->replaceTail($position, '');
                }
            }
            return \true;
        }
        // l
        //      delete if in R2 and preceded by l
        if (($position = $this->searchIfInR2(array('l'))) !== \false) {
            if (Utf8::substr($this->word, $position - 1, 1) === 'l') {
                $this->replaceTail($position, '');
            }
            return \true;
        }
        return \false;
    }
    /**
     * Turn every Y marker introduced by stem() back into a plain y.
     */
    public function finish()
    {
        $this->word = Utf8::str_replace('Y', 'y', $this->word);
    }
    /**
     * Override R1 for words beginning with gener, commun or arsen:
     * R1 is then the remainder of the word after that prefix.
     */
    private function exceptionR1()
    {
        foreach (array('gener' => 5, 'commun' => 6, 'arsen' => 5) as $prefix => $prefixLength) {
            if (Utf8::strpos($this->word, $prefix) === 0) {
                $this->r1 = Utf8::substr($this->word, $prefixLength);
                $this->r1Index = $prefixLength;
                return;
            }
        }
    }
    /**
     *  1/ Stem certain special words as follows,
     *  2/ If one of the following is found, leave it invariant,
     *
     * @return string|null the fixed stem, or null when the word is not an exception
     */
    private function exception1()
    {
        $exceptions = array(
            'skis' => 'ski',
            'skies' => 'sky',
            'dying' => 'die',
            'lying' => 'lie',
            'tying' => 'tie',
            'idly' => 'idl',
            'gently' => 'gentl',
            'ugly' => 'ugli',
            'early' => 'earli',
            'only' => 'onli',
            'singly' => 'singl',
            // invariants
            'sky' => 'sky',
            'news' => 'news',
            'howe' => 'howe',
            'atlas' => 'atlas',
            'cosmos' => 'cosmos',
            'bias' => 'bias',
            'andes' => 'andes',
        );
        return isset($exceptions[$this->word]) ? $exceptions[$this->word] : null;
    }
    /**
     * Following step 1a, leave the following invariant,
     *
     * @return string|null the word itself when it is invariant, null otherwise
     */
    private function exception2()
    {
        $invariants = array(
            'inning' => 'inning',
            'outing' => 'outing',
            'canning' => 'canning',
            'herring' => 'herring',
            'earring' => 'earring',
            'proceed' => 'proceed',
            'exceed' => 'exceed',
            'succeed' => 'succeed',
        );
        return isset($invariants[$this->word]) ? $invariants[$this->word] : null;
    }
    /**
     *  A word is called short if it ends in a short syllable, and if R1 is null.
     *  Note : R1 is not stored as null; instead the current word must end exactly
     *  where R1 starts (its length equals the R1 index).
     *
     *  @return boolean
     */
    private function isShort()
    {
        $endsInShortSyllable = $this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2);
        return $endsInShortSyllable && Utf8::strlen($this->word) == $this->r1Index;
    }
    /**
     * Define a short syllable in a word as either
     *  (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
     *  (b) a vowel at the beginning of the word followed by a non-vowel.
     *
     *  So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
     *  But uproot, bestow, disturb do not end with a short syllable.
     *
     * @param int $from      index of the first letter of the window; a negative value counts from the end of the word
     * @param int $nbLetters 2 to test form (b), any other value to test form (a)
     *
     * @return boolean true when the window holds a short syllable of the requested form
     */
    private function searchShortSyllabe($from, $nbLetters)
    {
        if ($from < 0) {
            $from = Utf8::strlen($this->word) + $from;
        }
        // The window starts before the word: there is nothing to inspect. Sliding
        // it to index 0 instead would test letters the caller did not ask about.
        if ($from < 0) {
            return \false;
        }
        $first = Utf8::substr($this->word, $from, 1);
        $second = Utf8::substr($this->word, $from + 1, 1);
        if ($nbLetters == 2) {
            // (b) is just for the beginning of the word; decide here and never
            // fall through into the three-letter test.
            return $from == 0
                && \in_array($first, self::$vowels, \true)
                && !\in_array($second, self::$vowels, \true);
        }
        // (a) non-vowel, vowel, then a non-vowel other than w, x or Y
        $third = Utf8::substr($this->word, $from + 2, 1);
        return !\in_array($first, self::$vowels, \true)
            && \in_array($second, self::$vowels, \true)
            && !\in_array($third, \array_merge(self::$vowels, array('x', 'Y', 'w')), \true);
    }
    /**
     * Cut the word at $position and append $replacement.
     *
     * @param int    $position    index where the suffix to replace starts
     * @param string $replacement text appended after the cut; '' deletes the suffix
     */
    private function replaceTail($position, $replacement)
    {
        $this->word = Utf8::substr($this->word, 0, $position) . $replacement;
    }
    /**
     * Tell whether any of the letters at indexes 0 .. $end - 1 is a vowel.
     *
     * @param int $end index of the first letter that is NOT inspected
     *
     * @return boolean
     */
    private function containsVowelBefore($end)
    {
        for ($i = 0; $i < $end; $i++) {
            if (\in_array(Utf8::substr($this->word, $i, 1), self::$vowels, \true)) {
                return \true;
            }
        }
        return \false;
    }
}