From 230abf98bcbedf32e642e532e6a6831bf409c21d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?ANDRE=20s=C3=A9bastien?= <sandre@afi-sa.fr> Date: Mon, 17 Feb 2025 09:50:19 +0100 Subject: [PATCH] hotline#209619 : use mb_converter and normalize for encoding indexation --- VERSIONS_HOTLINE/209619 | 1 + library/Class/CharSet.php | 10 ++++++ library/Class/Indexation.php | 23 ++----------- .../Class/Indexation/SpecialFormatTest.php | 32 +++++++++++++++++++ .../Class/Indexation/unicode_format.txt | 1 + tests/library/Class/MoteurRechercheTest.php | 4 +-- 6 files changed, 48 insertions(+), 23 deletions(-) create mode 100644 VERSIONS_HOTLINE/209619 create mode 100644 tests/library/Class/Indexation/SpecialFormatTest.php create mode 100644 tests/library/Class/Indexation/unicode_format.txt diff --git a/VERSIONS_HOTLINE/209619 b/VERSIONS_HOTLINE/209619 new file mode 100644 index 00000000000..444f3823bb9 --- /dev/null +++ b/VERSIONS_HOTLINE/209619 @@ -0,0 +1 @@ + - correctif #209619 : Recherche : Indexation des caractères spéciaux pour éviter de séparer les mots en deux. \ No newline at end of file diff --git a/library/Class/CharSet.php b/library/Class/CharSet.php index 83d2f46a898..44099157ca1 100644 --- a/library/Class/CharSet.php +++ b/library/Class/CharSet.php @@ -45,4 +45,14 @@ class Class_CharSet { return iconv('UTF-8', 'ISO-8859-1', ($words ?? '')); } + + + public static function tryConvertToUtf8(string $words): string + { + $words = Normalizer::normalize($words); + + return preg_match('`&[A-Za-z]+;`', $words) + ?
mb_convert_encoding($words, 'UTF-8', 'HTML-ENTITIES') + : $words; + } } diff --git a/library/Class/Indexation.php b/library/Class/Indexation.php index dffef5ace82..2bae9f68459 100644 --- a/library/Class/Indexation.php +++ b/library/Class/Indexation.php @@ -129,21 +129,6 @@ class Class_Indexation { 'Ž' => [self::INDEXATION => 'Z', self::PHONETIX => 'Z'] ]; - protected static array $_html_conv = ['â' => 'a', - 'à' => 'a', - 'é' => 'e', - 'ê' => 'e', - 'è' => 'e', - 'ë' => 'e', - 'î' => 'i', - 'ï' => 'i', - 'ô' => 'o', - 'œ' => 'oe', - 'û' => 'u', - 'ù' => 'u', - 'ü' => 'u', - 'ç' => 'c']; - protected static $_instance; protected static array $_alpha_maj_cache = []; @@ -390,18 +375,14 @@ class Class_Indexation { if ('' === $expression) return ''; - $expression = str_replace(array_keys(static::$_html_conv), - array_values(static::$_html_conv), - $expression); - $expression = preg_replace('/&[A-Za-z]+;/i', ' ', $expression); + $expression = Class_CharSet::tryConvertToUtf8($expression); $expression = str_replace(array_keys(static::$_min_to_maj), array_map(fn($values) => $values[$key], static::$_min_to_maj), $expression); - $expression = Class_CharSet::fromISOtoUTF8(strtoupper($expression)); - $expression = preg_replace('/[^A-Z0-9]/', ' ', $expression); + $expression = preg_replace('/[^A-Z0-9]/', ' ', strtoupper($expression)); return trim(preg_replace('/\s+/', ' ', $expression)); } diff --git a/tests/library/Class/Indexation/SpecialFormatTest.php b/tests/library/Class/Indexation/SpecialFormatTest.php new file mode 100644 index 00000000000..e5f7debd302 --- /dev/null +++ b/tests/library/Class/Indexation/SpecialFormatTest.php @@ -0,0 +1,32 @@ +<?php +/** + * Copyright (c) 2012-2025, Agence Française Informatique (AFI). All rights reserved. + * + * BOKEH is free software; you can redistribute it and/or modify + * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by + * the Free Software Foundation. 
+ * + * There are special exceptions to the terms and conditions of the AGPL as it + * is applied to this software (see README file). + * + * BOKEH is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU AFFERO GENERAL PUBLIC LICENSE for more details. + * + * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE + * along with BOKEH; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +class Class_Indexation_SpecialFormatTest extends ModelTestCase +{ + + /** @test */ + public function withFormatUnicodeShouldConvertAccentWithoutEmptyChar() + { + $chaine = file_get_contents(realpath(__DIR__) . '/unicode_format.txt'); + $this->assertEquals('MATERIALITE', (new Class_Indexation)->alphaMaj($chaine)); + } +} diff --git a/tests/library/Class/Indexation/unicode_format.txt b/tests/library/Class/Indexation/unicode_format.txt new file mode 100644 index 00000000000..f731ea884ad --- /dev/null +++ b/tests/library/Class/Indexation/unicode_format.txt @@ -0,0 +1 @@ +mateÌrialiteÌ diff --git a/tests/library/Class/MoteurRechercheTest.php b/tests/library/Class/MoteurRechercheTest.php index 514937445fa..900995dba80 100644 --- a/tests/library/Class/MoteurRechercheTest.php +++ b/tests/library/Class/MoteurRechercheTest.php @@ -481,7 +481,7 @@ class MoteurRechercheSimpleWithOtherIndexFieldsTest extends MoteurRechercheSimpl * * @test */ - public function lancerRechercheSimpleShouldBe(array $params, string $sql) + public function lancerRechercheSimpleBakounineShouldBe(array $params, string $sql) { $this->mockReqProfilAndZone($params); @@ -1082,7 +1082,7 @@ class MoteurRechercheCountWordsTest extends MoteurRechercheTestCase * * @test */ - public function lancerRechercheSimpleShouldBe(array $params, int $count_words) + public function 
lancerRechercheSimpleCountWordsShouldBe(array $params, int $count_words) { $this->mockReqProfilAndZone($params); -- GitLab