'], [' <', '> '], $text)); $text = Html::decodeEntities($text); $text_length = strlen($text); // Make a list of unique keywords that are actually found in the text, // which could be items in $keys or replacements that are equivalent through // \Drupal\search\SearchTextProcessorInterface::analyze(). $temp_keys = []; foreach ($keys as $key) { $key = _search_find_match_with_simplify($key, $text, $boundary_character, $langcode); if (isset($key)) { // Quote slashes so they can be used in regular expressions. $temp_keys[] = preg_quote($key, '/'); } } // Several keywords could have simplified down to the same thing, so pick // out the unique ones. $keys = array_unique($temp_keys); // Extract fragments of about 60 characters around keywords, bounded by word // boundary characters. Try to reach 256 characters, using second occurrences // if necessary. $ranges = []; $length = 0; $look_start = []; $remaining_keys = $keys; while ($length < 256 && !empty($remaining_keys)) { $found_keys = []; foreach ($remaining_keys as $key) { if ($length >= 256) { break; } // Remember where we last found $key, in case we are coming through a // second time. if (!isset($look_start[$key])) { $look_start[$key] = 0; } // See if we can find $key after where we found it the last time. Since // we are requiring a match on a word boundary, make sure $text starts // and ends with a space. $matches = []; if (preg_match('/' . $preceded_by_boundary . $key . $followed_by_boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) { $found_position = $matches[0][1]; $look_start[$key] = $found_position + 1; // Keep track of which keys we found this time, in case we need to // pass through again to find more text. $found_keys[] = $key; // Locate a space before and after this match, leaving about 60 // characters of context on each end. $before = strpos(' ' . 
$text, ' ', max(0, $found_position - 61)); if ($before !== FALSE && $before <= $found_position) { if ($text_length > $found_position + 60) { $after = strrpos(substr($text, 0, $found_position + 60), ' ', $found_position); } else { $after = $text_length; } if ($after !== FALSE && $after > $found_position) { // Account for the spaces we added. $before = max($before - 1, 0); if ($before < $after) { // Save this range. $ranges[$before] = $after; $length += $after - $before; } } } } } // Next time through this loop, only look for keys we found this time, // if any. $remaining_keys = $found_keys; } if (empty($ranges)) { // We didn't find any keyword matches, so just return the first part of the // text. We also need to re-encode any HTML special characters that we // entity-decoded above. return [ '#plain_text' => Unicode::truncate($text, 256, TRUE, TRUE), ]; } // Sort the text ranges by starting position. ksort($ranges); // Collapse overlapping text ranges into one. The sorting makes it O(n). $new_ranges = []; $max_end = 0; foreach ($ranges as $this_from => $this_to) { $max_end = max($max_end, $this_to); if (!isset($working_from)) { // This is the first time through this loop: initialize. $working_from = $this_from; $working_to = $this_to; continue; } if ($this_from <= $working_to) { // The ranges overlap: combine them. $working_to = max($working_to, $this_to); } else { // The ranges do not overlap: save the working range and start a new one. $new_ranges[$working_from] = $working_to; $working_from = $this_from; $working_to = $this_to; } } // Save the remaining working range. $new_ranges[$working_from] = $working_to; // Fetch text within the combined ranges we found. $out = []; foreach ($new_ranges as $from => $to) { $out[] = substr($text, $from, $to - $from); } // Combine the text chunks with "…" separators. The "…" needs to be // translated. Let translators have the … separator text as one chunk. 
/**
 * Finds an appropriate keyword in text.
 *
 * Three strategies are tried in order and the first one that succeeds wins:
 * - A case-insensitive match of the trimmed $key between word boundaries. In
 *   this case the trimmed $key itself is returned.
 * - A match after lower-casing and removing diacritics from both the key and
 *   the text. In this case the corresponding segment of the original $text
 *   is returned.
 * - A match after running both the key and the text through
 *   \Drupal\search\SearchTextProcessorInterface::analyze(). Two binary
 *   searches over the word boundaries of $text then narrow the result down
 *   to a short run of words that still matches.
 *
 * @param string $key
 *   The keyword to find.
 * @param string $text
 *   The text to search for the keyword.
 * @param string $boundary
 *   Regular expression for the boundary character class (characters that
 *   indicate spaces between words).
 * @param string|null $langcode
 *   Language code for the language of $text, if known.
 *
 * @return string|null
 *   A segment of $text that is between word boundary characters that either
 *   matches $key directly, or matches $key when both this text segment and
 *   $key are processed by
 *   \Drupal\search\SearchTextProcessorInterface::analyze(). If a matching text
 *   segment is not located, NULL is returned.
 */
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
  $preceded_by_boundary = '(?<=' . $boundary . ')';
  $followed_by_boundary = '(?=' . $boundary . ')';

  // See if $key appears as-is. When testing, make sure $text starts/ends with
  // a space, because we require $key to be surrounded by word boundary
  // characters.
  $temp = trim($key);
  if ($temp === '') {
    return NULL;
  }
  if (preg_match('/' . $preceded_by_boundary . preg_quote($temp, '/') . $followed_by_boundary . '/iu', ' ' . $text . ' ')) {
    return $temp;
  }

  // See if there is a match after lower-casing and removing diacritics in
  // both, which should preserve the string length.
  $transliteration = \Drupal::service('transliteration');
  $new_text = $transliteration->removeDiacritics(mb_strtolower($text));
  $new_key = $transliteration->removeDiacritics(mb_strtolower($temp));
  $matches = [];
  if (preg_match('/' . $preceded_by_boundary . preg_quote($new_key, '/') . $followed_by_boundary . '/u', ' ' . $new_text . ' ', $matches, PREG_OFFSET_CAPTURE)) {
    // Use the position of the word-boundary match that preg_match() actually
    // found. Searching again with mb_strpos() would return the first
    // occurrence of the key as a plain substring, which can lie inside an
    // unrelated word (for example "uber" inside "tuber").
    //
    // The captured offset is a byte offset into the padded subject, so drop
    // the leading padding space and convert the byte offset into a character
    // offset before cutting the segment out of the original text.
    $byte_offset = $matches[0][1] - 1;
    $position = mb_strlen(substr($new_text, 0, $byte_offset));
    return mb_substr($text, $position, mb_strlen($new_key));
  }

  // Run both text and key through text processor.
  /** @var \Drupal\search\SearchTextProcessorInterface $text_processor */
  $text_processor = \Drupal::service('search.text_processor');
  $simplified_key = trim($text_processor->analyze($key, $langcode));
  $simplified_text = trim($text_processor->analyze($text, $langcode));
  if ($simplified_key === '' || $simplified_text === '' || !str_contains($simplified_text, $simplified_key)) {
    // The simplified keyword and text do not match at all, or are empty.
    return NULL;
  }

  // Split $text into words, keeping track of where the word boundaries are.
  $words = preg_split('/' . $boundary . '+/u', $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
  if ($words === FALSE) {
    // The split failed (for instance because $text is not valid UTF-8), so
    // there are no word positions to work with.
    return NULL;
  }
  // Add an entry pointing to the end of the string, for the loop below.
  $words[] = ['', strlen($text)];

  // Using a binary search, find the earliest possible ending position in
  // $text where it will still match the keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $start_index = 0;
  $start_pos = $words[$start_index][1];
  $min_end_index = 1;
  $max_end_index = count($words) - 1;
  while ($max_end_index > $min_end_index) {
    // Check the index half way between min and max (rounded down). See if we
    // ended there, if we would still have a match.
    $proposed_end_index = intdiv($max_end_index + $min_end_index, 2);
    $proposed_end_pos = $words[$proposed_end_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $start_pos, $proposed_end_pos - $start_pos), $langcode));
    if (str_contains($trial_text, $simplified_key)) {
      // The proposed endpoint is fine, text still matches.
      $max_end_index = $proposed_end_index;
    }
    else {
      // The proposed endpoint index is too early, so the earliest possible
      // OK ending point would be the next index.
      $min_end_index = $proposed_end_index + 1;
    }
  }

  // Now do the same for the starting position: using a binary search, find the
  // latest possible starting position in $text where it will still match the
  // keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $end_index = $min_end_index;
  $end_pos = $words[$end_index][1];
  $min_start_index = 0;
  $max_start_index = $end_index - 1;
  while ($max_start_index > $min_start_index) {
    // Check the index half way between min and max (rounded up, so that the
    // loop always makes progress). See if we started there, if we would still
    // have a match.
    $proposed_start_index = intdiv($max_start_index + $min_start_index + 1, 2);
    $proposed_start_pos = $words[$proposed_start_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $proposed_start_pos, $end_pos - $proposed_start_pos), $langcode));
    if (str_contains($trial_text, $simplified_key)) {
      // The proposed start point is fine, text still matches.
      $min_start_index = $proposed_start_index;
    }
    else {
      // The proposed start point index is too late, so the latest possible
      // OK starting point would be the previous index.
      $max_start_index = $proposed_start_index - 1;
    }
  }
  $start_index = $max_start_index;

  // Return the matching text. We need to use substr() here and not the
  // mb_substr() function, because the indices in $words came from preg_split(),
  // so they are Unicode-safe byte positions, not character positions.
  return trim(substr($text, $words[$start_index][1], $words[$end_index][1] - $words[$start_index][1]));
}