From 230abf98bcbedf32e642e532e6a6831bf409c21d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ANDRE=20s=C3=A9bastien?= <sandre@afi-sa.fr>
Date: Mon, 17 Feb 2025 09:50:19 +0100
Subject: [PATCH] hotline#209619 : use mb_converter and normalize for encoding
 indexation

---
 VERSIONS_HOTLINE/209619                       |  1 +
 library/Class/CharSet.php                     | 10 ++++++
 library/Class/Indexation.php                  | 23 ++-----------
 .../Class/Indexation/SpecialFormatTest.php    | 32 +++++++++++++++++++
 .../Class/Indexation/unicode_format.txt       |  1 +
 tests/library/Class/MoteurRechercheTest.php   |  4 +--
 6 files changed, 48 insertions(+), 23 deletions(-)
 create mode 100644 VERSIONS_HOTLINE/209619
 create mode 100644 tests/library/Class/Indexation/SpecialFormatTest.php
 create mode 100644 tests/library/Class/Indexation/unicode_format.txt

diff --git a/VERSIONS_HOTLINE/209619 b/VERSIONS_HOTLINE/209619
new file mode 100644
index 00000000000..444f3823bb9
--- /dev/null
+++ b/VERSIONS_HOTLINE/209619
@@ -0,0 +1 @@
+ - correctif #209619 : Recherche : Indexation des caractères spéciaux pour éviter de séparer les mots en deux.
\ No newline at end of file
diff --git a/library/Class/CharSet.php b/library/Class/CharSet.php
index 83d2f46a898..44099157ca1 100644
--- a/library/Class/CharSet.php
+++ b/library/Class/CharSet.php
@@ -45,4 +45,14 @@ class Class_CharSet
   {
     return iconv('UTF-8', 'ISO-8859-1', ($words ?? ''));
   }
+
+
+  public static function tryConvertToUtf8(string $words): string
+  {
+    // normalize() returns false on invalid UTF-8: keep the raw input then.
+    $words = Normalizer::normalize($words) ?: $words;
+    if (!preg_match('`&[A-Za-z]+;`', $words))
+      return $words;
+    // mbstring's 'HTML-ENTITIES' pseudo-encoding is deprecated since PHP 8.2.
+    return html_entity_decode($words, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+  }
 }
diff --git a/library/Class/Indexation.php b/library/Class/Indexation.php
index dffef5ace82..2bae9f68459 100644
--- a/library/Class/Indexation.php
+++ b/library/Class/Indexation.php
@@ -129,21 +129,6 @@ class Class_Indexation {
      'Ž' => [self::INDEXATION => 'Z', self::PHONETIX => 'Z']
     ];
 
-  protected static array $_html_conv = ['&acirc;' => 'a',
-                                        '&agrave;' => 'a',
-                                        '&eacute;' => 'e',
-                                        '&ecirc;' => 'e',
-                                        '&egrave;' => 'e',
-                                        '&euml;' => 'e',
-                                        '&icirc;' => 'i',
-                                        '&iuml;' => 'i',
-                                        '&ocirc;' => 'o',
-                                        '&oelig;' => 'oe',
-                                        '&ucirc;' => 'u',
-                                        '&ugrave;' => 'u',
-                                        '&uuml;' => 'u',
-                                        '&ccedil;' => 'c'];
-
   protected static $_instance;
   protected static array $_alpha_maj_cache = [];
 
@@ -390,18 +375,14 @@ class Class_Indexation {
     if ('' === $expression)
       return '';
 
-    $expression = str_replace(array_keys(static::$_html_conv),
-                              array_values(static::$_html_conv),
-                              $expression);
-    $expression = preg_replace('/&[A-Za-z]+;/i', ' ', $expression);
+    $expression = Class_CharSet::tryConvertToUtf8($expression);
 
     $expression = str_replace(array_keys(static::$_min_to_maj),
                               array_map(fn($values) => $values[$key],
                                         static::$_min_to_maj),
                               $expression);
 
-    $expression = Class_CharSet::fromISOtoUTF8(strtoupper($expression));
-    $expression = preg_replace('/[^A-Z0-9]/', ' ', $expression);
+    $expression = preg_replace('/[^A-Z0-9]/', ' ', strtoupper($expression));
 
     return trim(preg_replace('/\s+/', ' ', $expression));
   }
diff --git a/tests/library/Class/Indexation/SpecialFormatTest.php b/tests/library/Class/Indexation/SpecialFormatTest.php
new file mode 100644
index 00000000000..e5f7debd302
--- /dev/null
+++ b/tests/library/Class/Indexation/SpecialFormatTest.php
@@ -0,0 +1,32 @@
+<?php
+/**
+ * Copyright (c) 2012-2025, Agence Française Informatique (AFI). All rights reserved.
+ *
+ * BOKEH is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by
+ * the Free Software Foundation.
+ *
+ * There are special exceptions to the terms and conditions of the AGPL as it
+ * is applied to this software (see README file).
+ *
+ * BOKEH is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU AFFERO GENERAL PUBLIC LICENSE for more details.
+ *
+ * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE
+ * along with BOKEH; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
+ */
+
+
+class Class_Indexation_SpecialFormatTest extends ModelTestCase
+{
+  /** @test */
+  public function withFormatUnicodeShouldConvertAccentWithoutEmptyChar()
+  {
+    // The accented fixture word must be indexed as one word, without a gap.
+    $indexed = (new Class_Indexation)->alphaMaj(file_get_contents(realpath(__DIR__) . '/unicode_format.txt'));
+    $this->assertEquals('MATERIALITE', $indexed);
+  }
+}
diff --git a/tests/library/Class/Indexation/unicode_format.txt b/tests/library/Class/Indexation/unicode_format.txt
new file mode 100644
index 00000000000..f731ea884ad
--- /dev/null
+++ b/tests/library/Class/Indexation/unicode_format.txt
@@ -0,0 +1 @@
+matérialité
diff --git a/tests/library/Class/MoteurRechercheTest.php b/tests/library/Class/MoteurRechercheTest.php
index 514937445fa..900995dba80 100644
--- a/tests/library/Class/MoteurRechercheTest.php
+++ b/tests/library/Class/MoteurRechercheTest.php
@@ -481,7 +481,7 @@ class MoteurRechercheSimpleWithOtherIndexFieldsTest extends MoteurRechercheSimpl
    *
    * @test
    */
-  public function lancerRechercheSimpleShouldBe(array $params, string $sql)
+  public function lancerRechercheSimpleBakounineShouldBe(array $params, string $sql)
   {
     $this->mockReqProfilAndZone($params);
 
@@ -1082,7 +1082,7 @@ class MoteurRechercheCountWordsTest extends MoteurRechercheTestCase
    *
    * @test
    */
-  public function lancerRechercheSimpleShouldBe(array $params, int $count_words)
+  public function lancerRechercheSimpleCountWordsShouldBe(array $params, int $count_words)
   {
     $this->mockReqProfilAndZone($params);
 
-- 
GitLab