| 1 | efrain | 1 | <?php
 | 
        
           |  |  | 2 |   | 
        
           |  |  | 3 | declare(strict_types=1);
 | 
        
           |  |  | 4 |   | 
        
           |  |  | 5 | namespace OpenSpout\Reader\XLSX\Manager\SharedStringsCaching;
 | 
        
           |  |  | 6 |   | 
        
           |  |  | 7 | /**
 | 
        
           |  |  | 8 |  * @internal
 | 
        
           |  |  | 9 |  */
 | 
        
           final readonly class CachingStrategyFactory implements CachingStrategyFactoryInterface
           {
               /**
                * Estimated amount of memory, in KB, required to keep a single shared string in memory.
                *
                * The estimate is derived from the following empirical measurements:
                *
                *        ------------------------------------
                *        | Number of chars⁺ | Memory needed |
                *        ------------------------------------
                *        |           3,000  |         1 MB  |
                *        |          15,000  |         2 MB  |
                *        |          30,000  |         5 MB  |
                *        |          75,000  |        11 MB  |
                *        |         150,000  |        21 MB  |
                *        |         300,000  |        43 MB  |
                *        |         750,000  |       105 MB  |
                *        |       1,500,000  |       210 MB  |
                *        |       2,250,000  |       315 MB  |
                *        |       3,000,000  |       420 MB  |
                *        |       4,500,000  |       630 MB  |
                *        ------------------------------------
                *
                *        ⁺ All characters were 1 byte long
                *
                * The measurements are linear: each 1-byte character costs about 150 bytes of storage.
                * Since a character may take up to 4 bytes, 600 bytes per character is used as a safe bound.
                * Cells hold about 20 characters on average (a purely empirical figure).
                *
                * The memory needed to hold one shared string is therefore:
                *   => 20 * 600 ≈ 12KB
                */
               public const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12;

               /**
                * Maximum number of shared strings written to one temporary file by the file-based strategy.
                *
                * When a workbook has a huge number of shared strings, they can be written to temporary files
                * rather than kept in memory, so that extraction does not run out of memory. Reading a string
                * then loads the contents of the file holding it, after which the lookup is fast.
                * Creating the temporary files is not the bottleneck; loading their contents is.
                * The most recently loaded file stays in memory until a different file is needed, so this
                * works best when the sheet data references shared strings in sorted index order.
                * 10,000 was chosen because it yields small files that load quickly.
                *
                * The same value is also the cut-off below which in-memory caching is accepted when the
                * memory limit is reported as -1.
                */
               public const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;

               /**
                * @param MemoryLimit $memoryLimit Provider of the memory limit (in KB) consulted when picking a strategy
                */
               public function __construct(private MemoryLimit $memoryLimit) {}

               /**
                * Picks the caching strategy best suited to the number of unique shared strings
                * and the memory limit.
                *
                * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
                * @param string   $tempFolder               Folder handed to the file-based strategy for its temporary files
                *
                * @return CachingStrategyInterface An in-memory strategy when deemed safe, a file-based one otherwise
                */
               public function createBestCachingStrategy(?int $sharedStringsUniqueCount, string $tempFolder): CachingStrategyInterface
               {
                   return $this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)
                       ? new InMemoryStrategy($sharedStringsUniqueCount)
                       : new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE);
               }

               /**
                * Tells whether in-memory caching can be used safely for the given number of unique
                * shared strings, judged against the memory limit.
                *
                * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
                */
               private function isInMemoryStrategyUsageSafe(?int $sharedStringsUniqueCount): bool
               {
                   // An unknown number of shared strings cannot be sized, so never go in-memory.
                   if (null === $sharedStringsUniqueCount) {
                       return false;
                   }

                   $memoryLimitInKB = $this->memoryLimit->getMemoryLimitInKB();

                   // -1 means the limit could not be read or is unlimited: do not trust it and play safe
                   // by only accepting counts smaller than one temporary file's worth of strings.
                   if (-1 === (int) $memoryLimitInKB) {
                       return $sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE;
                   }

                   // Otherwise the estimated footprint of all strings must stay strictly below the limit.
                   return $memoryLimitInKB > $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB;
               }
           }
 |