diff --git a/Linse.php b/Linse.php index 3b55d3ad2270df14caf48ff585c3421fb6fbcdf5..de15a554aebfb2cadfbb189b4df7a6b2672f1b76 100644 --- a/Linse.php +++ b/Linse.php @@ -21,14 +21,13 @@ class Linse { - protected static - $_instance, - $_alpha_maj_cache = []; + protected $_converter; private $articles; // Articles rejetes private $inclu; // Mots inclus private $exclu; // Mots vides private $pluriel; // Règles des pluriels + private $tableMajFrom, $tableMajTo = []; // Table de transco pour majuscules private $tableMajUtf8; // Table de transco pour majuscules accentuées utf8 @@ -43,10 +42,27 @@ class Linse { 'ú' => 'Ú', 'ù' => 'Ù', 'û' => 'Û', 'ü' => 'Ü','ç' => 'Ç', 'ñ' => 'Ñ', 'ß' => 'S']; - public static function getInstance() { - if(!static::$_instance) - static::$_instance = new static(); - return static::$_instance; + public static function forIso5426() { + return (new static())->beForIso5426(); + } + + + public function beForIso5426() { + $this->_converter = new Linse_Iso5426Converter(); + return $this; + } + + + public function beForUtf8() { + $this->_converter = new Linse_Utf8Converter(); + return $this; + } + + + public function getConverter() { + return $this->_converter + ? $this->_converter + : ($this->_converter = new Linse_Utf8Converter()); } @@ -106,56 +122,6 @@ class Linse { ['*AU','*AUX'], ['PC','PC'], ['DS','DS']]; - - // Init table ascii pour majuscules - $this->tableMajTo = str_split(str_repeat( ' ', 42 ) - . '* 0123456789 ' - . 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ' - . 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ' - . str_repeat( ' ', 63) - .'AAAAAAACEEEEIIII NOOOOO UUUUY AAAAAAACEEEEIIII NOOOOO UUUUY Y'); - - for($i=0; $i<count($this->tableMajTo); $i++) - $this->tableMajFrom[] = chr($i); - - - $this->tableMajUtf8 = ['É' => 'E', - 'È' => 'E', - 'Ë' => 'E', - 'Ê' => 'E', - 'Ã' => 'A', - 'À' => 'A', - 'Ä' => 'A', - 'Â' => 'A', - 'Ã…' => 'A', - 'Ã' => 'A', - 'Æ' => 'AE', - 'Ã' => 'I', - 'ÃŽ' => 'I', - 'ÃŒ' => 'I', - 'Ã' => 'I', - 'Ô' => 'O', - 'Ö' => 'O', - 'Ã’' => 'O', - 'Ó' => 'O', - 'Õ' => 'O', - 'Ø' => 'O', - 'Å’' => 'OE', - 'Ú' => 'U', - 'Ù' => 'U', - 'Û' => 'U', - 'Ü' => 'U', - 'Ñ' => 'N', - 'Ç' => 'C', - '¿' => '', - 'Å' => 'L', - 'Å»' => 'Z', - 'Ä' => 'D', // d slash upper != eth upper - 'IJ' => 'IJ', - 'Þ' => 'TH', - 'Ã' => 'D', // eth upper != d slash upper - 'ß' => 'SS', - ]; } @@ -185,7 +151,7 @@ class Linse { public function getExpressionRecherche($mot) { - if (!$mot=trim($mot)) + if (!$mot = trim($mot)) return false; $etoile = ''; @@ -201,36 +167,26 @@ class Linse { public function alphaMaj($chaine) { - $chaine = mb_strtoupper($chaine); - - if (isset(static::$_alpha_maj_cache[$chaine])) - return static::$_alpha_maj_cache[$chaine]; - - $chaine = str_replace(array_keys($this->tableMajUtf8), array_values($this->tableMajUtf8), $chaine); - $chaine = utf8_decode($chaine); - - return static::$_alpha_maj_cache[$chaine] = trim(str_replace($this->tableMajFrom, - $this->tableMajTo, - $chaine)); + return $this->getConverter()->toIndexable($chaine); } public function getClefAlpha($type_doc,$titre,$complement_titre,$auteur,$tome,$editeur,$annee) { - $clef=$this->getClefOeuvre($titre,$complement_titre,$auteur,$tome).'-'; - $clef.=substr($this->alphaMaj(str_replace(' ','',$editeur)),0,80).'-'; - $clef.=$annee.'-'; - $clef.=$type_doc; - $clef=str_replace(' ','',$clef); + $clef = $this->getClefOeuvre($titre,$complement_titre,$auteur,$tome).'-'; + $clef .= substr($this->alphaMaj(str_replace(' ','',$editeur)),0,80).'-'; + $clef .= $annee . '-'; + $clef .= $type_doc; + $clef = str_replace(' ', '', $clef); return $clef; } public function getClefOeuvre($titre,$complement_titre,$auteur,$tome) { $clef = substr($this->codeAlphaTitre(strtoupper(str_replace(' ','',$titre))),0,80).'-'; - $clef.=substr($this->codeAlphaTitre(strtoupper(str_replace(' ','',$complement_titre))),0,20).'-'; - $clef.=substr($this->alphaMaj(str_replace(' ','',$auteur)),0,80).'-'; - $clef.=$this->alphaMaj($tome); - $clef=str_replace(' ','',$clef); + $clef .= substr($this->codeAlphaTitre(strtoupper(str_replace(' ','',$complement_titre))),0,20).'-'; + $clef .= substr($this->alphaMaj(str_replace(' ','',$auteur)),0,80).'-'; + $clef .= $this->alphaMaj($tome); + $clef = str_replace(' ', '', $clef); return $clef; } diff --git a/Linse/Converter.php b/Linse/Converter.php new file mode 100644 index 0000000000000000000000000000000000000000..f71b6a35c7474102e7c256ec534599b6bfa4bd0b --- /dev/null +++ b/Linse/Converter.php @@ -0,0 +1,44 @@ +<?php +/** + * Copyright (c) 2012-2016, Agence Française Informatique (AFI). All rights reserved. + * + * LINSE is free software; you can redistribute it and/or modify + * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by + * the Free Software Foundation. + * + * There are special exceptions to the terms and conditions of the AGPL as it + * is applied to this software (see README file). + * + * LINSE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU AFFERO GENERAL PUBLIC LICENSE for more details. + * + * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE + * along with LINSE; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +class Linse_Converter { + protected $_ascii_to_uppercase = []; + protected $_ascii_map = []; + + + public function __construct() { + $this->_ascii_to_uppercase = str_split(str_repeat(' ', 42) + . '* 0123456789 ' + . 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ' + . 'ABCDEFGHIJKLMNOPQRSTUVWXYZ ' + . str_repeat(' ', 63) + .'AAAAAAACEEEEIIII NOOOOO UUUUY AAAAAAACEEEEIIII NOOOOO UUUUY Y'); + + for ($i=0; $i < count($this->_ascii_to_uppercase); $i++) + $this->_ascii_map[] = chr($i); + } + + + public function toIndexable($data) { + return trim(str_replace($this->_ascii_map, $this->_ascii_to_uppercase, $data)); + } +} \ No newline at end of file diff --git a/Linse/Iso5426Converter.php b/Linse/Iso5426Converter.php index d1f96c593911c9fc8f4cb14e6a1686b944876de5..8133961ed03a77c282e176126f8c1071e70595f6 100644 --- a/Linse/Iso5426Converter.php +++ b/Linse/Iso5426Converter.php @@ -20,11 +20,54 @@ */ -class Linse_Iso5426Converter { +class Linse_Iso5426Converter extends Linse_Converter { + public function toIndexable($data) { + $data = str_replace([chr(136), chr(137)], '', $data); // Les delimiteurs d'article bnf + + $result = ''; + $len = strlen($data); + for ($i=0; $i < $len; $i++) { + $ord = ord($data[$i]); + if (0xe0 < $ord) { + $result .= $this->charToIndexable($ord); + continue; + } + + $result .= $ord == 0x80 ? '€' : $data[$i]; + } + + $result = preg_replace('/[' . chr(123) . '-' . chr(255) .']/', '', $result); + return parent::toIndexable($result); + } + + + public function charToIndexable($c) { + $map = [0xe1 => 'AE', + 0xe2 => 'D', + 0xe6 => 'IJ', + 0xe8 => 'L', + 0xe9 => 'O', + 0xea => 'OE', + 0xec => 'TH', + 0xf1 => 'ae', + 0xf2 => 'd', + 0xf3 => 'd', + 0xf5 => 'i', + 0xf6 => 'ij', + 0xf8 => 'l', + 0xf9 => 'o', + 0xfa => 'oe', + 0xfb => 'ss', + 0xfc => 'th']; + + return array_key_exists($c, $map) ? $map[$c] : $c; + } + + public function toUnicode($data) { $data = str_replace([chr(136), chr(137)], '', $data); // Les delimiteurs d'article bnf - if (!preg_match("/[\xC1-\xFF]./misU", $data)) + if (!preg_match("/[\xC0-\xFF]/misU", $data)) return $data; $result = ''; @@ -393,7 +436,8 @@ class Linse_Iso5426Converter { if (array_key_exists($combined_chars, $combined)) return html_entity_decode('&#'. $combined[$combined_chars] . ';', ENT_NOQUOTES, 'UTF-8'); - $single = [0xe1 => 'Æ', + $single = [0xbd => 'ʹ', + 0xe1 => 'Æ', 0xe2 => 'Ä', 0xe6 => 'IJ', 0xe8 => 'Å', @@ -412,9 +456,7 @@ class Linse_Iso5426Converter { 0xfc => 'þ', ]; - if (array_key_exists($c1, $single)) - return $single[$c1] . chr($c2); - - return '?'; + return (array_key_exists($c1, $single) ? $single[$c1] : '?') + . chr($c2); } } \ No newline at end of file diff --git a/Linse/Utf8Converter.php b/Linse/Utf8Converter.php new file mode 100644 index 0000000000000000000000000000000000000000..2f21cfbd38f65e4b70c88ac3ee7768d8163126f5 --- /dev/null +++ b/Linse/Utf8Converter.php @@ -0,0 +1,72 @@ +<?php +/** + * Copyright (c) 2012-2014, Agence Française Informatique (AFI). All rights reserved. + * + * BOKEH is free software; you can redistribute it and/or modify + * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by + * the Free Software Foundation. + * + * There are special exceptions to the terms and conditions of the AGPL as it + * is applied to this software (see README file). + * + * LINSE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU AFFERO GENERAL PUBLIC LICENSE for more details. + * + * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE + * along with LINSE; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +class Linse_Utf8Converter extends Linse_Converter { + protected $tableMajUtf8 = ['É' => 'E', + 'È' => 'E', + 'Ë' => 'E', + 'Ê' => 'E', + 'Ã' => 'A', + 'À' => 'A', + 'Ä' => 'A', + 'Â' => 'A', + 'Ã…' => 'A', + 'Ã' => 'A', + 'Æ' => 'AE', + 'Ã' => 'I', + 'ÃŽ' => 'I', + 'ÃŒ' => 'I', + 'Ã' => 'I', + 'Ô' => 'O', + 'Ö' => 'O', + 'Ã’' => 'O', + 'Ó' => 'O', + 'Õ' => 'O', + 'Ø' => 'O', + 'Å’' => 'OE', + 'Ú' => 'U', + 'Ù' => 'U', + 'Û' => 'U', + 'Ü' => 'U', + 'Ñ' => 'N', + 'Ç' => 'C', + '¿' => '', + 'Å' => 'L', + 'Å»' => 'Z', + 'Ä' => 'D', // d slash upper != eth upper + 'IJ' => 'IJ', + 'Þ' => 'TH', + 'Ã' => 'D', // eth upper != d slash upper + 'ß' => 'SS', + ]; + + + public function toIndexable($data) { + $data = mb_strtoupper($data); + $data = str_replace(array_keys($this->tableMajUtf8), + array_values($this->tableMajUtf8), + $data); + $data = utf8_decode($data); + + return parent::toIndexable($data); + } +} \ No newline at end of file