348 lines
14 KiB
PHP
348 lines
14 KiB
PHP
|
|
<?php
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @file
|
||
|
|
*/
|
||
|
|
|
||
|
|
use Drupal\Component\Utility\Html;
|
||
|
|
use Drupal\Component\Utility\Unicode;
|
||
|
|
use Drupal\search\SearchTextProcessorInterface;
|
||
|
|
|
||
|
|
/**
 * Implements hook_theme_suggestions_HOOK().
 */
function search_theme_suggestions_search_result(array $variables): array {
  // Offer a single suggestion, keyed by the plugin ID found in $variables.
  $suggestions = [];
  $suggestions[] = 'search_result__' . $variables['plugin_id'];
  return $suggestions;
}
|
||
|
|
|
||
|
|
/**
 * Implements hook_preprocess_HOOK() for block templates.
 */
function search_preprocess_block(&$variables): void {
  // Blocks other than the search form block are left untouched.
  if ($variables['plugin_id'] != 'search_form_block') {
    return;
  }

  // Give the search form block a role attribute of "search".
  $variables['attributes']['role'] = 'search';
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @defgroup search Search interface
|
||
|
|
* @{
|
||
|
|
* The Drupal search interface manages a global search mechanism.
|
||
|
|
*
|
||
|
|
* Modules may plug into this system to provide searches of different types of
|
||
|
|
* data. Most of the system is handled by the Search module, so this must be
|
||
|
|
* enabled for all of the search features to work.
|
||
|
|
*
|
||
|
|
* There are two ways to interact with the search system:
|
||
|
|
* - Specifically for searching nodes, you can implement
|
||
|
|
* hook_node_update_index() and hook_node_search_result(). However, note that
|
||
|
|
* the search system already indexes all visible output of a node; i.e.,
|
||
|
|
* everything displayed normally during node viewing. This is
|
||
|
|
* usually sufficient. You should only use this mechanism if you want
|
||
|
|
* additional, non-visible data to be indexed.
|
||
|
|
* - Define a plugin implementing \Drupal\search\Plugin\SearchInterface and
|
||
|
|
* annotated as \Drupal\search\Annotation\SearchPlugin. This will create a
|
||
|
|
* search page type that users can use to set up one or more search pages.
|
||
|
|
* Each of these corresponds to a tab on the /search page, which can be
|
||
|
|
* used to perform searches. You will also need to implement the execute()
|
||
|
|
* method from the interface to perform the search. A base class is provided
|
||
|
|
* in \Drupal\search\Plugin\SearchPluginBase. For more information about
|
||
|
|
* plugins, see the @link plugin_api Plugin API topic. @endlink
|
||
|
|
*
|
||
|
|
* If your module needs to provide a more complicated search form, then you need
|
||
|
|
* to implement it yourself. In that case, you may wish to define it as a local
|
||
|
|
* task (tab) under the /search page (e.g. /search/my_module) so that users can
|
||
|
|
* easily find it.
|
||
|
|
*
|
||
|
|
* @see plugin_api
|
||
|
|
* @see annotation
|
||
|
|
*/
|
||
|
|
|
||
|
|
/**
 * Returns snippets from a piece of text, with search keywords highlighted.
 *
 * Used for formatting search results. All HTML tags will be stripped from
 * $text.
 *
 * The excerpt is assembled from fragments of roughly 60 bytes of context on
 * either side of each keyword occurrence, until about 256 bytes have been
 * collected. If no keyword is found, the start of the text is returned
 * instead.
 *
 * @param string $keys
 *   A string containing a search query.
 * @param string $text
 *   The text to extract fragments from.
 * @param string|null $langcode
 *   Language code for the language of $text, if known.
 *
 * @return array
 *   A render array containing HTML for the excerpt: either a '#plain_text'
 *   element (no keyword matched) or a '#markup' element in which matched
 *   keywords are wrapped in <strong> tags.
 */
function search_excerpt($keys, $text, $langcode = NULL): array {
  // We highlight around non-indexable or CJK characters.
  $boundary_character = '[' . Unicode::PREG_CLASS_WORD_BOUNDARY . SearchTextProcessorInterface::PREG_CLASS_CJK . ']';
  $preceded_by_boundary = '(?<=' . $boundary_character . ')';
  $followed_by_boundary = '(?=' . $boundary_character . ')';

  // Extract positive keywords and phrases.
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text by stripping HTML tags and decoding HTML entities.
  $text = strip_tags(str_replace(['<', '>'], [' <', '> '], $text));
  $text = Html::decodeEntities($text);
  // All offsets below are byte offsets (they come from preg_match() and
  // strpos()), so the length is measured in bytes as well.
  $text_length = strlen($text);

  // Make a list of unique keywords that are actually found in the text,
  // which could be items in $keys or replacements that are equivalent through
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $temp_keys = [];
  foreach ($keys as $key) {
    $key = _search_find_match_with_simplify($key, $text, $boundary_character, $langcode);
    if (isset($key)) {
      // Quote slashes so they can be used in regular expressions.
      $temp_keys[] = preg_quote($key, '/');
    }
  }
  // Several keywords could have simplified down to the same thing, so pick
  // out the unique ones.
  $keys = array_unique($temp_keys);

  // Extract fragments of about 60 characters around keywords, bounded by word
  // boundary characters. Try to reach 256 characters, using second occurrences
  // if necessary.
  $ranges = [];
  $length = 0;
  $look_start = [];
  $remaining_keys = $keys;

  while ($length < 256 && !empty($remaining_keys)) {
    $found_keys = [];
    foreach ($remaining_keys as $key) {
      if ($length >= 256) {
        break;
      }

      // Remember where we last found $key, in case we are coming through a
      // second time.
      if (!isset($look_start[$key])) {
        $look_start[$key] = 0;
      }

      // See if we can find $key after where we found it the last time. Since
      // we are requiring a match on a word boundary, make sure $text starts
      // and ends with a space.
      $matches = [];
      if (preg_match('/' . $preceded_by_boundary . $key . $followed_by_boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
        $found_position = $matches[0][1];
        // Resume the next search just past the first character of this match.
        // The offset is counted in bytes while the pattern runs in UTF-8 mode,
        // so it has to land on a character boundary: stepping a single byte
        // into a multi-byte first character makes preg_match() fail, and
        // later occurrences of the keyword would never be found.
        $first_character_bytes = strlen(mb_substr($matches[0][0], 0, 1, 'UTF-8'));
        $look_start[$key] = $found_position + max(1, $first_character_bytes);
        // Keep track of which keys we found this time, in case we need to
        // pass through again to find more text.
        $found_keys[] = $key;

        // Locate a space before and after this match, leaving about 60
        // characters of context on each end.
        $before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
        if ($before !== FALSE && $before <= $found_position) {
          if ($text_length > $found_position + 60) {
            $after = strrpos(substr($text, 0, $found_position + 60), ' ', $found_position);
          }
          else {
            $after = $text_length;
          }
          if ($after !== FALSE && $after > $found_position) {
            // Account for the spaces we added.
            $before = max($before - 1, 0);
            if ($before < $after) {
              // Save this range.
              $ranges[$before] = $after;
              $length += $after - $before;
            }
          }
        }
      }
    }
    // Next time through this loop, only look for keys we found this time,
    // if any.
    $remaining_keys = $found_keys;
  }

  if (empty($ranges)) {
    // We didn't find any keyword matches, so just return the first part of the
    // text. We also need to re-encode any HTML special characters that we
    // entity-decoded above.
    return [
      '#plain_text' => Unicode::truncate($text, 256, TRUE, TRUE),
    ];
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Collapse overlapping text ranges into one. The sorting makes it O(n).
  $new_ranges = [];
  $max_end = 0;
  foreach ($ranges as $this_from => $this_to) {
    $max_end = max($max_end, $this_to);
    if (!isset($working_from)) {
      // This is the first time through this loop: initialize.
      $working_from = $this_from;
      $working_to = $this_to;
      continue;
    }
    if ($this_from <= $working_to) {
      // The ranges overlap: combine them.
      $working_to = max($working_to, $this_to);
    }
    else {
      // The ranges do not overlap: save the working range and start a new one.
      $new_ranges[$working_from] = $working_to;
      $working_from = $this_from;
      $working_to = $this_to;
    }
  }
  // Save the remaining working range.
  $new_ranges[$working_from] = $working_to;

  // Fetch text within the combined ranges we found.
  $out = [];
  foreach ($new_ranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }

  // Combine the text chunks with "…" separators. The "…" needs to be
  // translated. Let translators have the … separator text as one chunk.
  $ellipses = explode('@excerpt', t('… @excerpt … @excerpt …'));
  // A leading separator is only needed when the first fragment does not start
  // at the beginning of the text; a trailing one only when the last fragment
  // stops short of the end.
  $text = (isset($new_ranges[0]) ? '' : $ellipses[0]) . implode($ellipses[1], $out) . (($max_end < strlen($text) - 1) ? $ellipses[2] : '');
  $text = Html::escape($text);

  // Highlight keywords. Must be done at once to prevent conflicts ('strong'
  // and '<strong>').
  $text = trim(preg_replace('/' . $preceded_by_boundary . '(?:' . implode('|', $keys) . ')' . $followed_by_boundary . '/iu', '<strong>\0</strong>', ' ' . $text . ' '));
  return [
    '#markup' => $text,
    '#allowed_tags' => ['strong'],
  ];
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @} End of "defgroup search".
|
||
|
|
*/
|
||
|
|
|
||
|
|
/**
 * Finds an appropriate keyword in text.
 *
 * Three strategies are tried in order: a direct case-insensitive match, a
 * match after lower-casing and removing diacritics from both strings, and
 * finally a match after running both strings through the search text
 * processor.
 *
 * @param string $key
 *   The keyword to find.
 * @param string $text
 *   The text to search for the keyword.
 * @param string $boundary
 *   Regular expression for the boundary character class (characters that
 *   indicate spaces between words).
 * @param string|null $langcode
 *   Language code for the language of $text, if known.
 *
 * @return string|null
 *   A segment of $text that is between word boundary characters that either
 *   matches $key directly, or matches $key when both this text segment and
 *   $key are processed by
 *   \Drupal\search\SearchTextProcessorInterface::analyze(). If a matching text
 *   segment is not located, NULL is returned.
 */
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
  $preceded_by_boundary = '(?<=' . $boundary . ')';
  $followed_by_boundary = '(?=' . $boundary . ')';

  // See if $key appears as-is. When testing, make sure $text starts/ends with
  // a space, because we require $key to be surrounded by word boundary
  // characters.
  $temp = trim($key);
  if ($temp === '') {
    return NULL;
  }
  if (preg_match('/' . $preceded_by_boundary . preg_quote($temp, '/') . $followed_by_boundary . '/iu', ' ' . $text . ' ')) {
    return $temp;
  }

  // See if there is a match after lower-casing and removing diacritics in
  // both, which should preserve the string length.
  $transliteration = \Drupal::service('transliteration');
  $new_text = $transliteration->removeDiacritics(mb_strtolower($text));
  $new_key = $transliteration->removeDiacritics(mb_strtolower($temp));
  $matches = [];
  if (preg_match('/' . $preceded_by_boundary . preg_quote($new_key, '/') . $followed_by_boundary . '/u', ' ' . $new_text . ' ', $matches, PREG_OFFSET_CAPTURE)) {
    // Use the position of the match that was actually found on a word
    // boundary. Searching again with mb_strpos() could instead hit an earlier
    // occurrence inside another word (e.g. "bar" inside "rebar") and return
    // the wrong segment of $text. The captured offset is a byte offset into
    // the space-padded string, so drop the one-byte leading space and convert
    // the remaining byte count into a character count.
    $byte_offset = $matches[0][1] - 1;
    $position = mb_strlen(substr($new_text, 0, $byte_offset));
    return mb_substr($text, $position, mb_strlen($new_key));
  }

  // Run both text and key through text processor.
  /** @var \Drupal\search\SearchTextProcessorInterface $text_processor */
  $text_processor = \Drupal::service('search.text_processor');
  $simplified_key = trim($text_processor->analyze($key, $langcode));
  $simplified_text = trim($text_processor->analyze($text, $langcode));
  if ($simplified_key === '' || $simplified_text === '' || !str_contains($simplified_text, $simplified_key)) {
    // The simplified keyword and text do not match at all, or are empty.
    return NULL;
  }

  // Split $text into words, keeping track of where the word boundaries are.
  $words = preg_split('/' . $boundary . '+/u', $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
  // Add an entry pointing to the end of the string, for the loop below.
  $words[] = ['', strlen($text)];

  // Using a binary search, find the earliest possible ending position in
  // $text where it will still match the keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $start_index = 0;
  $start_pos = $words[$start_index][1];
  $min_end_index = 1;
  $max_end_index = count($words) - 1;
  while ($max_end_index > $min_end_index) {
    // Check the index half way between min and max (rounded down). See if we
    // ended there, if we would still have a match.
    $proposed_end_index = intdiv($max_end_index + $min_end_index, 2);
    $proposed_end_pos = $words[$proposed_end_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $start_pos, $proposed_end_pos - $start_pos), $langcode));
    if (str_contains($trial_text, $simplified_key)) {
      // The proposed endpoint is fine, text still matches.
      $max_end_index = $proposed_end_index;
    }
    else {
      // The proposed endpoint index is too early, so the earliest possible
      // OK ending point would be the next index.
      $min_end_index = $proposed_end_index + 1;
    }
  }

  // Now do the same for the starting position: using a binary search, find the
  // latest possible starting position in $text where it will still match the
  // keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $end_index = $min_end_index;
  $end_pos = $words[$end_index][1];
  $min_start_index = 0;
  $max_start_index = $end_index - 1;
  while ($max_start_index > $min_start_index) {
    // Check the index half way between min and max (rounded up, so that the
    // loop always makes progress). See if we started there, if we would still
    // have a match.
    $proposed_start_index = intdiv($max_start_index + $min_start_index + 1, 2);
    $proposed_start_pos = $words[$proposed_start_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $proposed_start_pos, $end_pos - $proposed_start_pos), $langcode));
    if (str_contains($trial_text, $simplified_key)) {
      // The proposed start point is fine, text still matches.
      $min_start_index = $proposed_start_index;
    }
    else {
      // The proposed start point index is too late, so the latest possible
      // OK starting point would be the previous index.
      $max_start_index = $proposed_start_index - 1;
    }
  }
  $start_index = $max_start_index;

  // Return the matching text. We need to use substr() here and not the
  // mb_substr() function, because the indices in $words came from preg_split(),
  // so they are Unicode-safe byte positions, not character positions.
  return trim(substr($text, $words[$start_index][1], $words[$end_index][1] - $words[$start_index][1]));
}
|