diff --git a/VERSIONS_WIP/162794 b/VERSIONS_WIP/162794
new file mode 100644
index 0000000000000000000000000000000000000000..4540bfbbacfd01577d71ecfc3e3b4c4b7dcdea1e
--- /dev/null
+++ b/VERSIONS_WIP/162794
@@ -0,0 +1 @@
+ - fonctionnalité #162794 : Utilitaire : Script effectuant la séparation d'un fichier unimarc global en fichier par bibliothèque
\ No newline at end of file
diff --git a/library/Class/Cosmogramme/Integration/SplitBySite.php b/library/Class/Cosmogramme/Integration/SplitBySite.php
new file mode 100644
index 0000000000000000000000000000000000000000..1872d652946b44fef02e56b18785039c3aeacf18
--- /dev/null
+++ b/library/Class/Cosmogramme/Integration/SplitBySite.php
@@ -0,0 +1,193 @@
+<?php
+/**
+ * Copyright (c) 2012-2022, Agence Française Informatique (AFI). All rights reserved.
+ *
+ * BOKEH is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by
+ * the Free Software Foundation.
+ *
+ * There are special exceptions to the terms and conditions of the AGPL as it
+ * is applied to this software (see README file).
+ *
+ * BOKEH is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU AFFERO GENERAL PUBLIC LICENSE for more details.
+ *
+ * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE
+ * along with BOKEH; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+/**
+ * Splits a global UNIMARC file into one file per site.
+ *
+ * Each record is appended to <dest_dir>/site<ID>/<source file name> for every
+ * distinct value found in the chosen subfield of its 995 fields; the copy
+ * written for a site only receives the 995 fields matching that site.
+ */
+class Class_Cosmogramme_Integration_SplitBySite {
+  use Trait_StormFileSystem;
+  use Trait_Logger;
+  use Trait_EchoError;
+
+  const ITEM_SUBFIELD_TO_SPLIT_ON = 'h';
+
+  protected string $_dest_file_name;
+  protected string $_dest_file_extension;
+  protected string $_dest_dir;
+  protected string $_split_on_subfield;
+  protected array $_records_by_site = [];
+  protected array $_items_by_site = [];
+
+
+  /**
+   * Reads the given file record by record, dispatches every record to the
+   * per-site files under $dest_dir, then logs per-site record and item counts.
+   *
+   * @param string $file_path_to_split path of the file to split
+   * @param string $dest_dir directory receiving the site<ID> sub-directories
+   * @param string $split_on_subfield code of the 995 subfield holding the site id
+   * @return self
+   */
+  public function split(string $file_path_to_split,
+                        string $dest_dir,
+                        string $split_on_subfield = self::ITEM_SUBFIELD_TO_SPLIT_ON) : self {
+
+    require_once('library/Class/Cosmogramme/FileParser.php');
+
+    $this->_logTitle('=', '**** Rapport Préparation fichier Import', 2);
+
+    $parts = pathinfo($file_path_to_split);
+    $this->_dest_dir = $dest_dir;
+    $this->_dest_file_name = $parts['filename'];
+    // pathinfo() returns no 'extension' key when the path has none
+    $this->_dest_file_extension = $parts['extension'] ?? '';
+    $this->_split_on_subfield = $split_on_subfield;
+
+    $marc_file_parser = new Class_Cosmogramme_FileParser_Marc($file_path_to_split, 0, null);
+    $count_processed = 0;
+    $this->_records_by_site = [];
+    $this->_items_by_site = ['total' => 0];
+
+    while (($unimarc = $marc_file_parser->next()) && !$unimarc->isEnd()) {
+      $this->_splitUnimarc($unimarc);
+      $count_processed++;
+      if ($count_processed % 1000 == 0)
+        $this->getLogger()->log(sprintf('%d notices traitées', $count_processed));
+    }
+
+    $this->_logHash($this->_records_by_site, 'notices');
+    $this->_logHash($this->_items_by_site, 'exemplaires');
+
+    $this->getLogger()->log(sprintf(' %d notices traitées au total', $count_processed));
+    return $this;
+  }
+
+
+  /**
+   * Logs $title between two 50-character rules made of $sep_char, after one
+   * line break and followed by $level line breaks.
+   */
+  protected function _logTitle(string $sep_char, string $title, string $level) : self {
+    // Double quotes are required: '\n' is a literal backslash followed by "n"
+    $this->getLogger()->log("\n");
+    $this->getLogger()->log(str_repeat($sep_char, 50));
+    $this->getLogger()->log($title);
+    $this->getLogger()->log(str_repeat($sep_char, 50));
+    // $level is declared as string while str_repeat() expects an int
+    $this->getLogger()->log(str_repeat("\n", (int) $level));
+    return $this;
+  }
+
+
+  /**
+   * Logs a statistics title for $value_label, then one line per entry of $hash.
+   *
+   * @param array $hash counters indexed by site id
+   * @param string $value_label label of what is counted
+   */
+  protected function _logHash(array $hash, string $value_label) : self {
+    $this->_logTitle('-', sprintf("\t\t Statistiques %s", $value_label), 1);
+
+    foreach ($hash as $id_site => $count)
+      $this->getLogger()->log(sprintf(' site %s : %4d %s', $id_site, $count, $value_label));
+    return $this;
+  }
+
+
+  /**
+   * Writes one copy of the record per distinct site value found in its 995
+   * fields and updates the record and item counters.
+   */
+  protected function _splitUnimarc(Class_Cosmogramme_FileParser_Record $unimarc) : self {
+    $writer = new Class_NoticeUnimarc_Writer();
+    $unimarc->withDataDo(fn($data) => $writer->setNotice($data));
+
+    $items = $writer->get_subfield('995');
+    $all_sites_in_record = array_unique($writer->get_subfield('995',
+                                                              $this->_split_on_subfield));
+
+    // Presumably strips the item fields; each site copy gets its own 995 back
+    $writer->delete_items();
+
+    $this->_items_by_site['total'] += count($items);
+
+    foreach ($all_sites_in_record as $id_site) {
+      $this->_writeSiteRecordToFile($writer, $id_site, $items);
+      $this->_records_by_site[$id_site] = ($this->_records_by_site[$id_site] ?? 0) + 1;
+    }
+
+    return $this;
+  }
+
+
+  /**
+   * Appends to the file of $id_site a clone of $writer holding only the 995
+   * fields of $items whose split subfield equals $id_site.
+   */
+  protected function _writeSiteRecordToFile(Class_NoticeUnimarc_Writer $writer,
+                                            string $id_site,
+                                            array $items) : self {
+    // The site id comes from the data: quote it so regex metacharacters and the
+    // '/' delimiter are matched literally, and require the subfield to end right
+    // after it so that site "3" does not pick up the items of site "3-A".
+    $site_pattern = '/' . chr(0x1f)
+      . preg_quote($this->_split_on_subfield . $id_site, '/')
+      . '\s*(?=[\x1e\x1f]|$)/';
+
+    $zone995s = array_filter($items,
+                             fn($zone) => 1 === preg_match($site_pattern, $zone));
+
+    $new_record = clone $writer;
+    foreach ($zone995s as $zone995)
+      $new_record->add_zone('995', $zone995);
+
+    $this->_items_by_site[$id_site] = ($this->_items_by_site[$id_site] ?? 0) + count($zone995s);
+
+    $new_record->update();
+    $this->_siteFileAppend($id_site, $new_record->getFullRecord());
+    return $this;
+  }
+
+
+  /**
+   * Appends $content to <dest_dir>/site<id_site>/<source file name>, creating
+   * the site directory when the file system reports it does not exist.
+   */
+  protected function _siteFileAppend(string $id_site, string $content) : self {
+    $site_dir = sprintf('%s/site%s', $this->_dest_dir, $id_site);
+
+    // No trailing dot when the source file has no extension
+    $file_name = $this->_dest_file_name;
+    if ('' !== $this->_dest_file_extension)
+      $file_name .= '.' . $this->_dest_file_extension;
+
+    if (!$this->getFileSystem()->fileExists($site_dir))
+      $this->getFileSystem()->mkdir($site_dir);
+
+    $this->getFileSystem()->fileAppendContents($site_dir . '/' . $file_name, $content);
+    return $this;
+  }
+}
diff --git a/scripts/split_unimarc_by_site.php b/scripts/split_unimarc_by_site.php
new file mode 100644
index 0000000000000000000000000000000000000000..b2e6f9e9ecc09ed638d3bdcd9958be87a4dfd132
--- /dev/null
+++ b/scripts/split_unimarc_by_site.php
@@ -0,0 +1,31 @@
+<?php
+require(__DIR__.'/../console.php');
+require_once(__DIR__.'/../cosmogramme/php/_init.php');
+require_once(__DIR__.'/../cosmogramme/php/classes/classe_log.php');
+
+
+// Check the argument count before reading $argv: with missing arguments only
+// the usage is printed and the script stops instead of calling split().
+if (count($argv) != 4) {
+  print <<<EOF
+split_unimarc_by_site.php splits an iso2709 unimarc file site into files relying on 995 subfield.
+It stores records with related items depending on the subfield parameter.
+
+Usage : php split_unimarc_by_site.php <file_to_split_path> <subfield_tag_to_split_file_on> <destination_directory>
+EOF;
+  exit(1);
+}
+
+$filename = $argv[1];
+$subfield = $argv[2];
+$dest_dir = $argv[3];
+
+$log = new log('integration');
+$log_debug = new log('debug', false);
+$log->setDebugLog($log_debug);
+$log->open(true);
+$log_debug->open(true);
+
+(new Class_Cosmogramme_Integration_SplitBySite)
+  ->setLogger($log)
+  ->split($filename, $dest_dir, $subfield);
diff --git a/tests/library/Class/Cosmogramme/Integration/PhasePrepareIntegrationsTest.php b/tests/library/Class/Cosmogramme/Integration/PhasePrepareIntegrationsTest.php
index 9f19a2fdf10d2bed3500ad2e77806d73aafbf94c..dfae859097a619f7dfa4f27c8ef340a8a781091e 100644
--- a/tests/library/Class/Cosmogramme/Integration/PhasePrepareIntegrationsTest.php
+++ b/tests/library/Class/Cosmogramme/Integration/PhasePrepareIntegrationsTest.php
@@ -35,20 +35,28 @@ abstract class PhasePrepareIntegrationsWithOAITestCase
       ->mkdir('ftp/my-library.net/transferts/foo')
       ->cd('ftp/my-library.net/transferts/foo')
 
-      ->touch('20180517mylibraryincr.txt',
-              fn($entry) => $entry->setMTime(strtotime('2018-05-17'))->putContents('incr 20180517'))
+      ->filePutContents('20180517mylibraryincr.txt', 'incr 20180517')
+      ->filePutContents('20190118mylibraryincr.txt', 'incr 20190118')
+      ->filePutContents('mylibrarytotal.txt', 'my library total content')
+      ->filePutContents('toosmall.txt', 'small')
+      ->cd('/');
+
+    $this->_file_system->entryAt('ftp/my-library.net/transferts/foo/20180517mylibraryincr.txt')
+      ->setMTime(strtotime('2018-05-17'));
+    $this->_file_system
+      ->touch('mylibrarytotal.txt',
+              fn($entry) => $entry->putContents('my library total content')->setSize(11000000));
 
-      ->touch('20190118mylibraryincr.txt',
-              fn($entry) => $entry->setMTime(strtotime('2019-01-18'))->putContents('incr 20190118'))
+    $this->_file_system->entryAt('ftp/my-library.net/transferts/foo/20190118mylibraryincr.txt')
+      ->setMTime(strtotime('2019-01-18'));
 
-      ->touch('mylibrarytotal.txt',
-              fn($entry) => $entry->putContents('my library total content')->setSize(11000000))
+    $this->_file_system->entryAt('ftp/my-library.net/transferts/foo/mylibrarytotal.txt')
+      ->setSize(11000000);
+
+    $this->_file_system->entryAt('ftp/my-library.net/transferts/foo/toosmall.txt')
+      ->setSize(1000000);
 
-      ->touch('toosmall.txt',
-              fn($entry) => $entry->setSize(1000000))
-      ->cd('/')
-      ;
 
 
     Class_Cosmogramme_Integration_WaitingFiles::setFileSystem($this->_file_system);
     Class_Cosmogramme_Integration_WaitingFile::setFileSystem($this->_file_system);
diff --git a/tests/library/Class/Cosmogramme/Integration/SplitMarcPerSiteTest.php b/tests/library/Class/Cosmogramme/Integration/SplitMarcPerSiteTest.php
new file mode 100644
index 0000000000000000000000000000000000000000..ab6bad0565f7a7667d045baa7b223181dd9bad9d
--- /dev/null
+++ b/tests/library/Class/Cosmogramme/Integration/SplitMarcPerSiteTest.php
@@ -0,0 +1,142 @@
+<?php
+/**
+ * Copyright (c) 2012-2022, Agence Française Informatique (AFI). All rights reserved.
+ *
+ * BOKEH is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE as published by
+ * the Free Software Foundation.
+ *
+ * There are special exceptions to the terms and conditions of the AGPL as it
+ * is applied to this software (see README file).
+ *
+ * BOKEH is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU AFFERO GENERAL PUBLIC LICENSE for more details.
+ * + * You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE + * along with BOKEH; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +class SplitMarcPerSiteTest extends ModelTestCase { + protected $_fs; + protected $_logger; + protected string $_log_content =''; + + public function setup() { + parent::setup(); + + $this->_logger = $this->mock(); + $append_log = function($content) { + $this->_log_content .= ' ' . $content . "\n"; + return $this->_logger; + }; + + $this->_logger + ->whenCalled('error')->willDo($append_log) + ->whenCalled('success')->willDo($append_log) + ->whenCalled('info')->willDo($append_log) + ->whenCalled('log')->willDo($append_log); + + $utils = (new Class_Cosmogramme_Integration_SplitBySite()); + $utils->setLogger($this->_logger); + + $this->_fs = new Storm_FileSystem_Volatile(); + $utils->setFilesystem($this->_fs); + $this->_fs->mkdir('/tmp'); + $utils->split(__DIR__.'/unimarc_to_split.uni','/tmp'); + } + + + public function getFilenames(){ + return [['/tmp/site3/unimarc_to_split.uni'], + ['/tmp/site12/unimarc_to_split.uni'], + ['/tmp/site30/unimarc_to_split.uni'], + ['/tmp/site33/unimarc_to_split.uni'], + ]; + } + + + /** @test + * @dataProvider getFilenames + */ + public function tmpDirShouldContainsFourFilesNamedUnimarcToSplit_x($filename) { + $this->assertTrue($this->_fs->fileExists($filename)); + } + + + /** @test */ + public function unimarcToSplitThreeShouldContainsTwoRecords() { + $unimarcs = preg_split('/'.chr(30).chr(29).'/', + $this->_fs->fileGetContents('/tmp/site3/unimarc_to_split.uni')); + $this->assertCount(2, + array_filter($unimarcs)); + return $unimarcs; + } + + + /** + * @depends unimarcToSplitThreeShouldContainsTwoRecords + * @test + */ + public function unimarcToSplitThreeFirstRecordShouldHaveOne995DollarHWithValueThree($unimarcs) { + $this->assertEquals(['3'], + (new Class_NoticeUnimarc()) + ->setNotice($unimarcs[0]) + 
->get_subfield('995', 'h')); + } + + + + /** @test */ + public function unimarcToSplitTwelveShouldContainsOneRecord() { + $unimarcs = preg_split('/'.chr(30).chr(29).'/', + $this->_fs->fileGetContents('/tmp/site12/unimarc_to_split.uni')); + $this->assertCount(1, + array_filter($unimarcs)); + return $unimarcs[0]; + } + + + + /** + * @depends unimarcToSplitTwelveShouldContainsOneRecord + * @test + */ + public function unimarcToSplitTwelveFirstRecordShouldHaveTwo995FieldsWithHSubfieldTwelve($unimarc) { + $this->assertEquals(['12', '12'], + (new Class_NoticeUnimarc()) + ->setNotice($unimarc) + ->get_subfield('995', 'h')); + } + + + /** @test */ + public function logShouldContainsTwoRecordsParsed() { + $this->assertContains('2 notices traitées', + $this->_log_content); + } + + + /** @test */ + public function logShouldContainsSiteThreeOneRecord() { + $this->assertContains(' site 3 : 2 notices + site 12 : 1 notices + site 30 : 1 notices + site 33 : 1 notices', + $this->_log_content); + } + + + /** @test */ + public function logShouldContainsSiteThreeOneItems() { + $this->assertContains(' site total : 6 exemplaires + site 3 : 2 exemplaires + site 12 : 2 exemplaires + site 30 : 1 exemplaires + site 33 : 1 exemplaires', + $this->_log_content); + } +} diff --git a/tests/library/Class/Cosmogramme/Integration/unimarc_to_split.uni b/tests/library/Class/Cosmogramme/Integration/unimarc_to_split.uni new file mode 100644 index 0000000000000000000000000000000000000000..b2392c9338c05acd35254568014b254e70baa91c --- /dev/null +++ b/tests/library/Class/Cosmogramme/Integration/unimarc_to_split.uni @@ -0,0 +1 @@ +01360cam 2200277 450 001001500000010003500015020001700050021002700067100004100094101000800135102000700143105001800150106000600168200003600174210003200210215005400242345001800296676000800314700004500322801002700367801003900394930001600433995021500449995020900664995020900873frOr0039046168 a2-7441-6257-4brel.d13,95 EUR aFRb00330021 aFRbDLE-20030819-34011 a20030819d2003 m y0frey0103 
ba0 afre aFR ay z 000aa ar1 aLes soeurs RobinfYves Viollier aPariscFrance loisirsd2003 a273 p.ccouv. ill., jaquette ill. en coul.d21 cm b9782744162572 a843 1311928505aViollierbYvesf1946-....4070 3aFR FrancebBM CAZAUBON 3aFRbBNFc20030819gAFNOR2intermrc 1a2003 19313300a00096000019164b39046168chd96fR VIOg3h3i3l1396100010o07/11/2018p18/12/2018q26/09/2018r07/11/2018t1396200047w1x2y2z1A705000024B1395C07/12/2012D961G5H5N2OyPhS96T3U2V96W2Z292100191600a00249000009228b39046168chd249fR VIOg12h12i12l43000134o01/07/2020p20/07/2020q20/06/2020r01/07/2020t1331400440w1x2y2z1B1395C23/01/2013D335G9H9N2OyPhS249T12U2V249W2Z293900092200a00249000006734b39012393chd249fR VIOg12h12i12l43000134o02/08/2021p02/08/2021q20/06/2020r01/07/2020t1331400440w1x2y2z1B1395C23/01/2013D335G9H9N2OyPhS249T12U2V249W2Z293900092201288cam 2200265 450 001001500000010003500015020001700050100004100067101001300108102000700121105001800128106000600146200007600152210003200228215004200260454002300302686000800325700004400333702004800377801004800425930001600473995020400489995013100693995019800824frOr0039045777 a2-7441-6360-0brel.d15,95 EUR aFRb00329925 a20030819d2003 m y0frey50 ba0 afreceng aFR a||||z 00|a| ar1 aLa nostalgie de l'angef C-c C-cAlice Seboldgtrad. d'Edith Soonckindt aPariscFrance loisirsd2003 a386 p.cjaquette ill. en coul.d21 cm 1t[The ]lovely bones a803 1314449938aSeboldbAlicef1964-....4070 1312136093aSoonckindtbEdithf1958-....4730 3aFRbFR-751131015c20030819gAFNOR2intermrc 1a2003-19228900a00096000124949b39045777chd96fR SEBg3h3i3l1396100362o06/07/2019p07/08/2019q21/04/2016r24/05/2016t1396100255w1x2y2z1B1595C09/01/2014D961G3H3N2OyPhS96T3U2V96W2Z292101249400a03204000016316b39045777chd3204fR SEBg30h30i30s4w1x2y2z1C30/04/2010D1F1N2OyS3204T30U2V3204W2Z296100163100a03206000006668b39045777chd3206fRAM SEBg33h33i33o06/02/2013p03/04/2013q01/08/2012r01/08/2012t700060066w1x2y2z1B1595C01/08/2012D1G3H3N0PhS3206T33U2V3206W2Z2975000666 \ No newline at end of file