3 # FILE: SearchEngine.php 5 # Open Source Metadata Archive Search Engine (OSMASE) 6 # Copyright 2002-2016 Edward Almasy and Internet Scout Research Group 7 # http://scout.wisc.edu 16 # ---- PUBLIC INTERFACE -------------------------------------------------- 18 # possible types of logical operators 22 # flags used for indicating field types 28 # flags used for indicating word states 44 # create database object for our use 47 # save item access parameters 52 # set default debug state 66 public function AddField($FieldId, $FieldType, $ItemTypes,
67 $Weight, $UsedInKeywordSearch)
70 $this->FieldInfo[$FieldId][
"FieldType"] = $FieldType;
71 $this->FieldInfo[$FieldId][
"Weight"] = $Weight;
72 $this->FieldInfo[$FieldId][
"InKeywordSearch"] =
73 $UsedInKeywordSearch ? TRUE : FALSE;
74 $this->FieldInfo[$FieldId][
"ItemTypes"] = is_array($ItemTypes)
75 ? $ItemTypes : array($ItemTypes);
85 return $this->FieldInfo[$FieldId][
"FieldType"];
95 return $this->FieldInfo[$FieldId][
"Weight"];
105 return $this->FieldInfo[$FieldId][
"InKeywordSearch"];
118 # ---- search functions 139 $SearchParams, $StartingResult = 0, $NumberOfResults = PHP_INT_MAX,
140 $SortByField = NULL, $SortDescending = TRUE)
142 # if keyword search string was passed in 143 if (is_string($SearchParams))
145 # convert string to search parameter set 146 $SearchString = $SearchParams;
148 $SearchParams->AddParameter($SearchString);
151 # interpret and filter out magic debugging keyword (if any) 152 $KeywordStrings = $SearchParams->GetKeywordSearchStrings();
153 foreach ($KeywordStrings as $String)
155 $FilteredString = $this->ExtractDebugLevel($String);
156 if ($FilteredString != $String)
158 $SearchParams->RemoveParameter($String);
159 $SearchParams->AddParameter($FilteredString);
163 # save start time to use in calculating search time 164 $StartTime = microtime(TRUE);
166 # clear parsed search term list 167 $this->SearchTermList = array();
170 $Scores = $this->RawSearch($SearchParams);
172 # count, sort, and trim search result scores list 173 $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
174 $SortByField, $SortDescending);
177 $this->LastSearchTime = microtime(TRUE) - $StartTime;
179 # return search results to caller 180 $this->
DMsg(0,
"Ended up with ".$this->NumberOfResultsAvailable.
" results");
204 $SearchStrings, $StartingResult = 0, $NumberOfResults = 10,
205 $SortByField = NULL, $SortDescending = TRUE)
207 # pass off the request to grouped search (for now) if appropriate 210 return $this->GroupedSearch($SearchStrings, $StartingResult,
211 $NumberOfResults, $SortByField, $SortDescending);
214 # interpret and filter out magic debugging keyword (if any) 215 $SearchStrings = $this->SetDebugLevel($SearchStrings);
216 $this->
DMsg(0,
"In FieldedSearch() with " 217 .count($SearchStrings).
" search strings");
219 # save start time to use in calculating search time 220 $StartTime = microtime(TRUE);
223 $Scores = $this->SearchAcrossFields($SearchStrings);
224 $Scores = ($Scores === NULL) ? array() : $Scores;
226 # count, sort, and trim search result scores list 227 $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
228 $SortByField, $SortDescending);
231 $this->LastSearchTime = microtime(TRUE) - $StartTime;
233 # return list of items to caller 234 $this->
DMsg(0,
"Ended up with ".$this->NumberOfResultsAvailable.
" results");
244 # save filter function name 245 $this->FilterFuncs[] = $FunctionName;
256 return ($ItemType === NULL) ? $this->NumberOfResultsAvailable
257 : (isset($this->NumberOfResultsPerItemType[$ItemType])
258 ? $this->NumberOfResultsPerItemType[$ItemType] : 0);
267 return $this->SearchTermList;
288 $FieldIds = $SearchParams->GetFields();
289 foreach ($FieldIds as $FieldId)
291 if (array_key_exists($FieldId, $this->FieldInfo))
293 $Weight += $this->FieldInfo[$FieldId][
"Weight"];
296 if (count($SearchParams->GetKeywordSearchStrings()))
298 foreach ($this->FieldInfo as $FieldId => $Info)
300 if ($Info[
"InKeywordSearch"])
302 $Weight += $Info[
"Weight"];
310 # ---- search database update functions 319 # clear word count added flags for this item 320 unset($this->WordCountAdded);
322 # delete any existing info for this item 323 $this->DB->Query(
"DELETE FROM SearchWordCounts WHERE ItemId = ".$ItemId);
324 $this->DB->Query(
"DELETE FROM SearchItemTypes WHERE ItemId = ".$ItemId);
327 $this->DB->Query(
"INSERT INTO SearchItemTypes (ItemId, ItemType)" 328 .
" VALUES (".intval($ItemId).
", ".intval($ItemType).
")");
330 # for each metadata field 331 foreach ($this->FieldInfo as $FieldId => $Info)
333 # if valid search weight for field and field applies to this item 334 if (($Info[
"Weight"] > 0)
335 && in_array($ItemType, $Info[
"ItemTypes"]))
337 # retrieve text for field 343 # for each text string in array 344 foreach ($Text as $String)
346 # record search info for text 347 $this->RecordSearchInfoForText($ItemId, $FieldId,
348 $Info[
"Weight"], $String,
349 $Info[
"InKeywordSearch"]);
354 # record search info for text 355 $this->RecordSearchInfoForText($ItemId, $FieldId,
356 $Info[
"Weight"], $Text,
357 $Info[
"InKeywordSearch"]);
371 # retrieve IDs for specified number of items starting at specified ID 372 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
", ".$this->ItemTypeFieldName
373 .
" FROM ".$this->ItemTableName
374 .
" WHERE ".$this->ItemIdFieldName.
" >= ".$StartingItemId
375 .
" ORDER BY ".$this->ItemIdFieldName.
" LIMIT ".$NumberOfItems);
376 $ItemIds = $this->DB->FetchColumn(
377 $this->ItemTypeFieldName, $this->ItemIdFieldName);
379 # for each retrieved item ID 380 foreach ($ItemIds as $ItemId => $ItemType)
382 # update search info for item 386 # return ID of last item updated to caller 396 # drop all entries pertaining to item from word count table 397 $this->DB->Query(
"DELETE FROM SearchWordCounts WHERE ItemId = ".$ItemId);
398 $this->DB->Query(
"DELETE FROM SearchItemTypes WHERE ItemId = ".$ItemId);
# drop all entries pertaining to field from word counts table
# (the ID is coerced with intval() and left unquoted, matching how the
#       other FieldId comparisons in this class are written -- the earlier
#       \' sequences were not escapes inside a double-quoted string, so the
#       backslashes were sent to the database and broke the statement)
$this->DB->Query("DELETE FROM SearchWordCounts WHERE FieldId = "
        .intval($FieldId));
417 return $this->DB->Query(
"SELECT COUNT(*) AS TermCount" 418 .
" FROM SearchWords",
"TermCount");
427 return $this->DB->Query(
"SELECT COUNT(DISTINCT ItemId) AS ItemCount" 428 .
" FROM SearchWordCounts",
"ItemCount");
440 # asssume no synonyms will be added 444 $WordId = $this->GetWordId($Word, TRUE);
446 # for each synonym passed in 447 foreach ($Synonyms as $Synonym)
450 $SynonymId = $this->GetWordId($Synonym, TRUE);
452 # if synonym is not already in database 453 $this->DB->Query(
"SELECT * FROM SearchWordSynonyms" 454 .
" WHERE (WordIdA = ".$WordId
455 .
" AND WordIdB = ".$SynonymId.
")" 456 .
" OR (WordIdB = ".$WordId
457 .
" AND WordIdA = ".$SynonymId.
")");
458 if ($this->DB->NumRowsSelected() == 0)
460 # add synonym entry to database 461 $this->DB->Query(
"INSERT INTO SearchWordSynonyms" 462 .
" (WordIdA, WordIdB)" 463 .
" VALUES (".$WordId.
", ".$SynonymId.
")");
468 # report to caller number of new synonyms added 481 $WordId = $this->GetWordId($Word);
484 if ($WordId !== NULL)
486 # if no specific synonyms provided 487 if ($Synonyms === NULL)
489 # remove all synonyms for word 490 $this->DB->Query(
"DELETE FROM SearchWordSynonyms" 491 .
" WHERE WordIdA = '".$WordId.
"'" 492 .
" OR WordIdB = '".$WordId.
"'");
496 # for each specified synonym 497 foreach ($Synonyms as $Synonym)
499 # look up ID for synonym 500 $SynonymId = $this->GetWordId($Synonym);
502 # if synonym ID was found 503 if ($SynonymId !== NULL)
505 # delete synonym entry 506 $this->DB->Query(
"DELETE FROM SearchWordSynonyms" 507 .
" WHERE (WordIdA = '".$WordId.
"'" 508 .
" AND WordIdB = '".$SynonymId.
"')" 509 .
" OR (WordIdB = '".$WordId.
"'" 510 .
" AND WordIdA = '".$SynonymId.
"')");
522 $this->DB->Query(
"DELETE FROM SearchWordSynonyms");
# assume no synonyms will be found
$Synonyms = array();

# look up ID for word
$WordId = $this->GetWordId($Word);

# if word ID was found
if ($WordId !== NULL)
{
    # look up IDs of all synonyms for this word
    # (the word may be stored on either side of a synonym pair)
    $this->DB->Query("SELECT WordIdA, WordIdB FROM SearchWordSynonyms"
            ." WHERE WordIdA = ".$WordId
            ." OR WordIdB = ".$WordId);
    $SynonymIds = array();

    # FetchRow must be called as a method -- without the parentheses
    #       PHP reads an (undefined) property and the loop never runs
    while ($Record = $this->DB->FetchRow())
    {
        # keep whichever side of the pair is not the word itself
        $SynonymIds[] = ($Record["WordIdA"] == $WordId)
                ? $Record["WordIdB"] : $Record["WordIdA"];
    }

    # for each synonym ID
    foreach ($SynonymIds as $SynonymId)
    {
        # look up synonym word and add to synonym list
        $Synonyms[] = $this->GetWord($SynonymId);
    }
}
560 # return synonyms to caller 570 # assume no synonyms will be found 571 $SynonymList = array();
573 # for each synonym ID pair 575 $OurDB->Query(
"SELECT WordIdA, WordIdB FROM SearchWordSynonyms");
576 while ($Record = $OurDB->FetchRow())
579 $Word = $this->GetWord($Record[
"WordIdA"]);
580 $Synonym = $this->GetWord($Record[
"WordIdB"]);
582 # if we do not already have an entry for the word 583 # or synonym is not listed for this word 584 if (!isset($SynonymList[$Word])
585 || !in_array($Synonym, $SynonymList[$Word]))
587 # add entry for synonym 588 $SynonymList[$Word][] = $Synonym;
591 # if we do not already have an entry for the synonym 592 # or word is not listed for this synonym 593 if (!isset($SynonymList[$Synonym])
594 || !in_array($Word, $SynonymList[$Synonym]))
597 $SynonymList[$Synonym][] = $Word;
602 # (this loop removes reciprocal duplicates) 603 foreach ($SynonymList as $Word => $Synonyms)
605 # for each synonym for that word 606 foreach ($Synonyms as $Synonym)
608 # if synonym has synonyms and word is one of them 609 if (isset($SynonymList[$Synonym])
610 && isset($SynonymList[$Word])
611 && in_array($Word, $SynonymList[$Synonym])
612 && in_array($Synonym, $SynonymList[$Word]))
614 # if word has less synonyms than synonym 615 if (count($SynonymList[$Word])
616 < count($SynonymList[$Synonym]))
618 # remove synonym from synonym list for word 619 $SynonymList[$Word] = array_diff(
620 $SynonymList[$Word], array($Synonym));
622 # if no synonyms left for word 623 if (!count($SynonymList[$Word]))
625 # remove empty synonym list for word 626 unset($SynonymList[$Word]);
631 # remove word from synonym list for synonym 632 $SynonymList[$Synonym] = array_diff(
633 $SynonymList[$Synonym], array($Word));
635 # if no synonyms left for word 636 if (!count($SynonymList[$Synonym]))
638 # remove empty synonym list for word 639 unset($SynonymList[$Synonym]);
646 # sort array alphabetically (just for convenience) 647 foreach ($SynonymList as $Word => $Synonyms)
649 asort($SynonymList[$Word]);
653 # return 2D array of synonyms to caller 664 # remove all existing synonyms 667 # for each synonym entry passed in 668 foreach ($SynonymList as $Word => $Synonyms)
670 # add synonyms for word 685 # asssume no synonyms will be added 688 # read in contents of file 689 $Lines = file($FileName, FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
691 # if file contained lines 694 # for each line of file 695 foreach ($Lines as $Line)
697 # if line is not a comment 698 if (!preg_match(
"/[\s]*#/", $Line))
700 # split line into words 701 $Words = preg_split(
"/[\s,]+/", $Line);
704 if (count($Words) > 1)
706 # separate out word and synonyms 707 $Word = array_shift($Words);
716 # return count of synonyms added to caller 721 # ---- PRIVATE INTERFACE ------------------------------------------------- 734 private $ExcludedTermCount;
737 private $InclusiveTermCount;
738 private $RequiredTermCount;
739 private $RequiredTermCounts;
740 private $SearchTermList;
741 private $WordCountAdded;
747 # ---- private methods (searching) 756 private function RawSearch($SearchParams)
758 # retrieve search strings 759 $SearchStrings = $SearchParams->GetSearchStrings();
760 $KeywordSearchStrings = $SearchParams->GetKeywordSearchStrings();
762 # add keyword searches (if any) to fielded searches 763 if (count($KeywordSearchStrings))
765 $SearchStrings[self::KEYWORD_FIELD_ID] = $KeywordSearchStrings;
768 # normalize search strings 769 $NormalizedSearchStrings = array();
770 foreach ($SearchStrings as $FieldId => $SearchStringArray)
772 if (!is_array($SearchStringArray))
774 $SearchStringArray = array($SearchStringArray);
776 foreach ($SearchStringArray as $String)
778 $String = trim($String);
781 $NormalizedSearchStrings[$FieldId][] = $String;
785 $SearchStrings = $NormalizedSearchStrings;
787 # if we have strings to search for 788 if (count($SearchStrings))
791 $Scores = $this->SearchAcrossFields(
792 $SearchStrings, $SearchParams->Logic());
796 foreach ($SearchParams->GetSubgroups() as $Subgroup)
798 # perform subgroup search 799 $NewScores = $this->RawSearch($Subgroup);
801 # added subgroup search scores to previous scores as appropriate 804 $Scores = $this->CombineScores(
805 $Scores, $NewScores, $SearchParams->Logic());
809 $Scores = $NewScores;
812 if (isset($NewScores))
814 $this->
DMsg(2,
"Have ".count($Scores)
815 .
" results after subgroup processing");
818 # pare down results to just allowed item types (if specified) 819 if ($SearchParams->ItemTypes())
821 $AllowedItemTypes = $SearchParams->ItemTypes();
822 foreach ($Scores as $ItemId => $Score)
824 if (!in_array($this->GetItemType($ItemId), $AllowedItemTypes))
826 unset($Scores[$ItemId]);
829 $this->
DMsg(3,
"Have ".count($Scores)
830 .
" results after paring to allowed item types");
833 # return search results to caller 834 return isset($Scores) ? $Scores : array();
844 private function CombineScores($ScoresA, $ScoresB, $Logic)
849 foreach ($ScoresB as $ItemId => $Score)
851 if (isset($Scores[$ItemId]))
853 $Scores[$ItemId] += $Score;
857 $Scores[$ItemId] = $Score;
864 foreach ($ScoresA as $ItemId => $Score)
866 if (isset($ScoresB[$ItemId]))
868 $Scores[$ItemId] = $Score + $ScoresB[$ItemId];
884 private function SearchAcrossFields($SearchStrings, $Logic)
886 # start by assuming no search will be done 890 $this->ExcludedTermCount = 0;
891 $this->InclusiveTermCount = 0;
892 $this->RequiredTermCount = 0;
893 $this->RequiredTermCounts = array();
896 $NeedComparisonSearch = FALSE;
897 foreach ($SearchStrings as $FieldId => $SearchStringArray)
899 # for each search string for this field 900 foreach ($SearchStringArray as $SearchString)
902 # if field is keyword or field is text and does not look 903 # like comparison match 904 $NotComparisonSearch = !preg_match(
905 self::COMPARISON_OPERATOR_PATTERN, $SearchString);
906 if (($FieldId == self::KEYWORD_FIELD_ID)
907 || (isset($this->FieldInfo[$FieldId])
908 && ($this->FieldInfo[$FieldId][
"FieldType"]
909 == self::FIELDTYPE_TEXT)
910 && $NotComparisonSearch))
912 $this->
DMsg(0,
"Searching text field \"" 913 .$FieldId.
"\" for string \"$SearchString\"");
915 # normalize text and split into words 917 $this->ParseSearchStringForWords($SearchString, $Logic);
919 # calculate scores for matching items 920 if (count($Words[$FieldId]))
922 $Scores = $this->SearchForWords(
923 $Words[$FieldId], $FieldId, $Scores);
924 $this->
DMsg(3,
"Have " 925 .count($Scores).
" results after word search");
929 $Phrases[$FieldId] = $this->ParseSearchStringForPhrases(
930 $SearchString, $Logic);
933 if (count($Phrases[$FieldId]))
935 $Scores = $this->SearchForPhrases(
936 $Phrases[$FieldId], $Scores, $FieldId, TRUE, FALSE);
937 $this->
DMsg(3,
"Have " 938 .count($Scores).
" results after phrase search");
943 # set flag to indicate possible comparison search candidate found 944 $NeedComparisonSearch = TRUE;
949 # perform comparison searches 950 if ($NeedComparisonSearch)
952 $Scores = $this->SearchForComparisonMatches(
953 $SearchStrings, $Logic, $Scores);
954 $this->
DMsg(3,
"Have ".count($Scores).
" results after comparison search");
957 # if no results found and exclusions specified 958 if (!count($Scores) && $this->ExcludedTermCount)
961 $Scores = $this->LoadScoresForAllRecords();
964 # if search results found 967 # for each search text string 968 foreach ($SearchStrings as $FieldId => $SearchStringArray)
970 # for each search string for this field 971 foreach ($SearchStringArray as $SearchString)
974 if (($FieldId == self::KEYWORD_FIELD_ID)
975 || (isset($this->FieldInfo[$FieldId])
976 && ($this->FieldInfo[$FieldId][
"FieldType"]
977 == self::FIELDTYPE_TEXT)))
979 # if there are words in search text 980 if (isset($Words[$FieldId]))
982 # handle any excluded words 983 $Scores = $this->FilterOnExcludedWords(
984 $Words[$FieldId], $Scores, $FieldId);
987 # handle any excluded phrases 988 if (isset($Phrases[$FieldId]))
990 $Scores = $this->SearchForPhrases(
991 $Phrases[$FieldId], $Scores,
992 $FieldId, FALSE, TRUE);
996 $this->
DMsg(3,
"Have ".count($Scores)
997 .
" results after processing exclusions");
1000 # strip off any results that don't contain required words 1001 $Scores = $this->FilterOnRequiredWords($Scores);
1004 # return search result scores to caller 1017 private function SearchForWords($Words, $FieldId, $Scores = NULL)
1021 # start with empty search result scores list if none passed in 1022 if ($Scores == NULL)
1028 foreach ($Words as $Word => $Flags)
1031 $this->
DMsg(2,
"Searching for word '${Word}' in field ".$FieldId);
1033 # if word is not excluded 1034 if (!($Flags & self::WORD_EXCLUDED))
1036 # look up record ID for word 1037 $this->
DMsg(2,
"Looking up word \"".$Word.
"\"");
1038 $WordId = $this->GetWordId($Word);
1041 if ($WordId !== NULL)
1043 # look up counts for word 1044 $DB->Query(
"SELECT ItemId,Count FROM SearchWordCounts " 1045 .
"WHERE WordId = ".$WordId
1046 .
" AND FieldId = ".$FieldId);
1047 $Counts =
$DB->FetchColumn(
"Count",
"ItemId");
1049 # if synonym support is enabled 1050 if ($this->SynonymsEnabled)
1052 # look for any synonyms 1053 $DB->Query(
"SELECT WordIdA, WordIdB" 1054 .
" FROM SearchWordSynonyms" 1055 .
" WHERE WordIdA = ".$WordId
1056 .
" OR WordIdB = ".$WordId);
1058 # if synonyms were found 1059 if (
$DB->NumRowsSelected())
1061 # retrieve synonym IDs 1062 $SynonymIds = array();
1063 while ($Record =
$DB->FetchRow())
1065 $SynonymIds[] = ($Record[
"WordIdA"] == $WordId)
1066 ? $Record[
"WordIdB"]
1067 : $Record[
"WordIdA"];
1071 foreach ($SynonymIds as $SynonymId)
1073 # retrieve counts for synonym 1074 $DB->Query(
"SELECT ItemId,Count" 1075 .
" FROM SearchWordCounts" 1076 .
" WHERE WordId = ".$SynonymId
1077 .
" AND FieldId = ".$FieldId);
1078 $SynonymCounts =
$DB->FetchColumn(
"Count",
"ItemId");
1081 foreach ($SynonymCounts as $ItemId => $Count)
1083 # adjust count because it's a synonym 1084 $AdjustedCount = ceil($Count / 2);
1086 # add count to existing counts 1087 if (isset($Counts[$ItemId]))
1089 $Counts[$ItemId] += $AdjustedCount;
1093 $Counts[$ItemId] = $AdjustedCount;
1101 # if stemming is enabled 1102 if ($this->StemmingEnabled)
1105 $Stem = PorterStemmer::Stem($Word);
1106 $this->
DMsg(2,
"Looking up stem \"".$Stem.
"\"");
1107 $StemId = $this->GetStemId($Stem);
1109 # if ID found for stem 1110 if ($StemId !== NULL)
1112 # retrieve counts for stem 1113 $DB->Query(
"SELECT ItemId,Count" 1114 .
" FROM SearchWordCounts" 1115 .
" WHERE WordId = ".$StemId
1116 .
" AND FieldId = ".$FieldId);
1117 $StemCounts =
$DB->FetchColumn(
"Count",
"ItemId");
1120 foreach ($StemCounts as $ItemId => $Count)
1122 # adjust count because it's a stem 1123 $AdjustedCount = ceil($Count / 2);
1125 # add count to existing counts 1126 if (isset($Counts[$ItemId]))
1128 $Counts[$ItemId] += $AdjustedCount;
1132 $Counts[$ItemId] = $AdjustedCount;
1138 # if counts were found 1142 foreach ($Counts as $ItemId => $Count)
1144 # if word flagged as required 1145 if ($Flags & self::WORD_REQUIRED)
1147 # increment required word count for record 1148 if (isset($this->RequiredTermCounts[$ItemId]))
1150 $this->RequiredTermCounts[$ItemId]++;
1154 $this->RequiredTermCounts[$ItemId] = 1;
1158 # add to item record score 1159 if (isset($Scores[$ItemId]))
1161 $Scores[$ItemId] += $Count;
1165 $Scores[$ItemId] = $Count;
1172 # return basic scores to caller 1183 private function ParseSearchStringForPhrases($SearchString, $Logic)
1185 # split into chunks delimited by double quote marks 1186 $Pieces = explode(
"\"", $SearchString); #
" 1188 # for each pair of chunks 1191 while ($Index < count($Pieces)) 1193 # grab phrase from chunk 1194 $Phrase = trim(addslashes($Pieces[$Index - 1])); 1195 $Flags = self::WORD_PRESENT; 1197 # grab first character of phrase 1198 $FirstChar = substr($Pieces[$Index - 2], -1); 1200 # set flags to reflect any option characters 1201 if ($FirstChar == "-
") 1203 $Flags |= self::WORD_EXCLUDED; 1204 if (!isset($Phrases[$Phrase])) 1206 $this->ExcludedTermCount++; 1211 if ((($Logic == "AND
") 1212 && ($FirstChar != "~
")) 1213 || ($FirstChar == "+
")) 1215 $Flags |= self::WORD_REQUIRED; 1216 if (!isset($Phrases[$Phrase])) 1218 $this->RequiredTermCount++; 1221 if (!isset($Phrases[$Phrase])) 1223 $this->InclusiveTermCount++; 1224 $this->SearchTermList[] = $Phrase; 1227 $Phrases[$Phrase] = $Flags; 1229 # move to next pair of chunks 1233 # return phrases to caller 1242 protected function SearchFieldForPhrases($FieldId, $Phrase) 1259 private function SearchForPhrases($Phrases, $Scores, $FieldId,
1260 $ProcessNonExcluded = TRUE, $ProcessExcluded = TRUE)
1262 # if phrases are found 1263 if (count($Phrases) > 0)
1265 # if this is a keyword search 1266 if ($FieldId == self::KEYWORD_FIELD_ID)
1269 foreach ($this->FieldInfo as $KFieldId => $Info)
1271 # if field is marked to be included in keyword searches 1272 if ($Info[
"InKeywordSearch"])
1274 # call ourself with that field 1275 $Scores = $this->SearchForPhrases(
1276 $Phrases, $Scores, $KFieldId,
1277 $ProcessNonExcluded, $ProcessExcluded);
1284 foreach ($Phrases as $Phrase => $Flags)
1286 $this->
DMsg(2,
"Searching for phrase '".$Phrase
1287 .
"' in field ".$FieldId);
1289 # if phrase flagged as excluded and we are doing excluded 1290 # phrases or phrase flagged as non-excluded and we 1291 # are doing non-excluded phrases 1292 if (($ProcessExcluded && ($Flags & self::WORD_EXCLUDED))
1293 || ($ProcessNonExcluded && !($Flags & self::WORD_EXCLUDED)))
1295 # initialize score list if necessary 1296 if ($Scores === NULL) { $Scores = array(); }
1298 # retrieve list of items that contain phrase 1302 # for each item that contains phrase 1303 foreach ($ItemIds as $ItemId)
1305 # if we are doing excluded phrases and phrase 1306 # is flagged as excluded 1307 if ($ProcessExcluded && ($Flags & self::WORD_EXCLUDED))
1309 # knock item off of list 1310 unset($Scores[$ItemId]);
1312 elseif ($ProcessNonExcluded)
1314 # calculate phrase value based on number of 1315 # words and field weight 1316 $PhraseScore = count(preg_split(
"/[\s]+/",
1317 $Phrase, -1, PREG_SPLIT_NO_EMPTY))
1318 * $this->FieldInfo[$FieldId][
"Weight"];
1319 $this->
DMsg(2,
"Phrase score is ".$PhraseScore);
1321 # bump up item record score 1322 if (isset($Scores[$ItemId]))
1324 $Scores[$ItemId] += $PhraseScore;
1328 $Scores[$ItemId] = $PhraseScore;
1331 # if phrase flagged as required 1332 if ($Flags & self::WORD_REQUIRED)
1334 # increment required word count for record 1335 if (isset($this->RequiredTermCounts[$ItemId]))
1337 $this->RequiredTermCounts[$ItemId]++;
1341 $this->RequiredTermCounts[$ItemId] = 1;
1351 # return updated scores to caller 1363 private function FilterOnExcludedWords($Words, $Scores, $FieldId)
1368 foreach ($Words as $Word => $Flags)
1370 # if word flagged as excluded 1371 if ($Flags & self::WORD_EXCLUDED)
1373 # look up record ID for word 1374 $WordId = $this->GetWordId($Word);
1377 if ($WordId !== NULL)
1379 # look up counts for word 1380 $DB->Query(
"SELECT ItemId FROM SearchWordCounts " 1381 .
"WHERE WordId=${WordId} AND FieldId=${FieldId}");
1384 while ($Record =
$DB->FetchRow())
1386 # if item record is in score list 1387 $ItemId = $Record[
"ItemId"];
1388 if (isset($Scores[$ItemId]))
1390 # remove item record from score list 1391 $this->
DMsg(3,
"Filtering out item ".$ItemId
1392 .
" because it contained word \"".$Word.
"\"");
1393 unset($Scores[$ItemId]);
1400 # returned filtered score list to caller 1409 private function FilterOnRequiredWords($Scores)
1411 # if there were required words 1412 if ($this->RequiredTermCount > 0)
1415 foreach ($Scores as $ItemId => $Score)
1417 # if item does not meet required word count 1418 if (!isset($this->RequiredTermCounts[$ItemId])
1419 || ($this->RequiredTermCounts[$ItemId]
1420 < $this->RequiredTermCount))
1423 $this->
DMsg(4,
"Filtering out item ".$ItemId
1424 .
" because it didn't have required word count of " 1425 .$this->RequiredTermCount
1426 .(isset($this->RequiredTermCounts[$ItemId])
1428 .$this->RequiredTermCounts[$ItemId]
1431 unset($Scores[$ItemId]);
1436 # return filtered list to caller 1452 private function CleanScores($Scores, $StartingResult, $NumberOfResults,
1453 $SortByField, $SortDescending)
1455 # perform any requested filtering 1456 $this->
DMsg(0,
"Have ".count($Scores).
" results before filter callbacks");
1459 # save total number of results available 1460 $this->NumberOfResultsAvailable = count($Scores);
1462 # sort search scores into item type bins 1463 $NewScores = array();
1464 foreach ($Scores as $Id => $Score)
1466 $ItemType = $this->GetItemType($Id);
1467 if ($ItemType !== NULL)
1469 $NewScores[$ItemType][$Id] = $Score;
1472 $Scores = $NewScores;
1474 # for each item type 1475 $NewSortByField = array();
1476 $NewSortDescending = array();
1477 foreach ($Scores as $ItemType => $TypeScores)
1479 # normalize sort field parameter 1480 $NewSortByField[$ItemType] = !is_array($SortByField) ? $SortByField
1481 : (isset($SortByField[$ItemType])
1482 ? $SortByField[$ItemType] : NULL);
1484 # normalize sort direction parameter 1485 $NewSortDescending[$ItemType] = !is_array($SortDescending) ? $SortDescending
1486 : (isset($SortDescending[$ItemType])
1487 ? $SortDescending[$ItemType] : TRUE);
1489 $SortByField = $NewSortByField;
1490 $SortDescending = $NewSortDescending;
1492 # for each item type 1493 foreach ($Scores as $ItemType => $TypeScores)
1495 # save number of results 1496 $this->NumberOfResultsPerItemType[$ItemType] = count($TypeScores);
1498 # if no sorting field specified 1499 if ($SortByField[$ItemType] === NULL)
1501 # sort result list by score 1502 if ($SortDescending[$ItemType])
1504 arsort($Scores[$ItemType], SORT_NUMERIC);
1508 asort($Scores[$ItemType], SORT_NUMERIC);
1513 # get list of item IDs in sorted order 1514 $SortedIds = $this->GetItemIdsSortedByField($ItemType,
1515 $SortByField[$ItemType], $SortDescending[$ItemType]);
1517 # if we have sorted item IDs 1518 if (count($SortedIds) && count($TypeScores))
1520 # strip sorted ID list down to those that appear in search results 1521 $SortedIds = array_intersect($SortedIds,
1522 array_keys($TypeScores));
1524 # rebuild score list in sorted order 1525 $NewScores = array();
1526 foreach ($SortedIds as $Id)
1528 $NewScores[$Id] = $TypeScores[$Id];
1530 $Scores[$ItemType] = $NewScores;
1534 # sort result list by score 1535 arsort($Scores[$ItemType], SORT_NUMERIC);
1539 # if subset of scores requested 1540 if (($StartingResult > 0) || ($NumberOfResults < PHP_INT_MAX))
1542 # trim scores back to requested subset 1543 $ScoresKeys = array_slice(array_keys($Scores[$ItemType]),
1544 $StartingResult, $NumberOfResults);
1545 $NewScores = array();
1546 foreach ($ScoresKeys as $Key)
1548 $NewScores[$Key] = $Scores[$ItemType][$Key];
1550 $Scores[$ItemType] = $NewScores;
1554 # returned cleaned search result scores list to caller 1565 # if filter functions have been set 1566 if (isset($this->FilterFuncs))
1569 foreach ($Scores as $ItemId => $Score)
1571 # for each filter function 1572 foreach ($this->FilterFuncs as $FuncName)
1574 # if filter function return TRUE for item 1575 if (call_user_func($FuncName, $ItemId))
1578 $this->
DMsg(2,
"Filter callback <i>".$FuncName
1579 .
"</i> rejected item ".$ItemId);
1580 unset($Scores[$ItemId]);
1582 # bail out of filter func loop 1589 # return filtered list to caller 1602 private function SearchForComparisonMatches($SearchStrings, $Logic, $Scores)
1606 foreach ($SearchStrings as $SearchFieldId => $SearchStringArray)
1608 # if field is not keyword 1609 if ($SearchFieldId != self::KEYWORD_FIELD_ID)
1611 # for each search string for this field 1612 foreach ($SearchStringArray as $SearchString)
1614 # look for comparison operators 1615 $FoundOperator = preg_match(
1616 self::COMPARISON_OPERATOR_PATTERN,
1617 $SearchString, $Matches);
1619 # if a comparison operator was found 1620 # or this is a field type that is always a comparison search 1621 if ($FoundOperator ||
1622 ($this->FieldInfo[$SearchFieldId][
"FieldType"]
1623 != self::FIELDTYPE_TEXT))
1625 # determine value to compare against 1626 $Value = trim(preg_replace(
1627 self::COMPARISON_OPERATOR_PATTERN,
'\2',
1630 # if no comparison operator was found 1631 if (!$FoundOperator)
1633 # assume comparison is equality 1634 $Operators[$Index] =
"=";
1638 # use operator from comparison match 1639 $Operators[$Index] = $Matches[1];
1642 # if operator was found 1643 if (isset($Operators[$Index]))
1646 $Values[$Index] = $Value;
1649 $FieldIds[$Index] = $SearchFieldId;
1650 $this->
DMsg(3,
"Added comparison (field = <i>" 1651 .$FieldIds[$Index].
"</i> op = <i>" 1652 .$Operators[$Index].
"</i> val = <i>" 1653 .$Values[$Index].
"</i>)");
1655 # move to next comparison array entry 1663 # if comparisons found 1664 if (isset($Operators))
1666 # perform comparisons on fields and gather results 1667 $Results = $this->SearchFieldsForComparisonMatches(
1668 $FieldIds, $Operators, $Values, $Logic);
1670 # if search logic is set to AND 1671 if ($Logic ==
"AND")
1673 # if results were found 1674 if (count($Results))
1676 # if there were no prior results and no terms for keyword search 1677 if ((count($Scores) == 0) && ($this->InclusiveTermCount == 0))
1679 # add all results to scores 1680 foreach ($Results as $ItemId)
1682 $Scores[$ItemId] = 1;
1687 # remove anything from scores that is not part of results 1688 foreach ($Scores as $ItemId => $Score)
1690 if (in_array($ItemId, $Results) == FALSE)
1692 unset($Scores[$ItemId]);
1705 # add result items to scores 1706 if ($Scores === NULL) { $Scores = array(); }
1707 foreach ($Results as $ItemId)
1709 if (isset($Scores[$ItemId]))
1711 $Scores[$ItemId] += 1;
1715 $Scores[$ItemId] = 1;
1721 # return results to caller 1732 private function SetDebugLevel($SearchStrings)
1734 # if search info is an array 1735 if (is_array($SearchStrings))
1737 # for each array element 1738 foreach ($SearchStrings as $FieldId => $SearchStringArray)
1740 # if element is an array 1741 if (is_array($SearchStringArray))
1743 # for each array element 1744 foreach ($SearchStringArray as $Index => $SearchString)
1746 # pull out search string if present 1747 $SearchStrings[$FieldId][$Index] =
1748 $this->ExtractDebugLevel($SearchString);
1753 # pull out search string if present 1754 $SearchStrings[$FieldId] =
1755 $this->ExtractDebugLevel($SearchStringArray);
1761 # pull out search string if present 1762 $SearchStrings = $this->ExtractDebugLevel($SearchStrings);
1765 # return new search info to caller 1766 return $SearchStrings;
1775 private function ExtractDebugLevel($SearchString)
1777 # if search string contains debug level indicator 1778 if (strstr($SearchString,
"DBUGLVL="))
1780 # remove indicator and set debug level 1781 $Level = preg_replace(
"/^\\s*DBUGLVL=([1-9]{1,2}).*/",
"\\1", $SearchString);
1785 $this->
DMsg(0,
"Setting debug level to ".$Level);
1786 $SearchString = preg_replace(
"/\s*DBUGLVL=${Level}\s*/",
"",
1791 # return (possibly) modified search string to caller 1792 return $SearchString;
1799 private function LoadScoresForAllRecords()
1801 # start with empty list 1805 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName
1806 .
" FROM ".$this->ItemTableName);
1807 while ($Record = $this->DB->FetchRow())
# ---- private methods (search DB building)

/**
 * Record an occurrence of a word (and of its stem, when stemming applies)
 * for an item/field pair in the SearchWordCounts table.
 * @param string $Word Word to record.
 * @param int $ItemId ID of item in which word was found.
 * @param int $FieldId ID of field in which word was found.
 * @param int $Weight Amount to add to the count for the word.  The stem
 *      (if any) is counted with half that amount, rounded up.  (OPTIONAL,
 *      defaults to 1)
 */
private function UpdateWordCount($Word, $ItemId, $FieldId, $Weight = 1)
{
    # the word itself is always counted (and added if not yet known)
    $IdsToCount = array($this->GetWordId($Word, TRUE));

    # the stem is counted too, provided stemming is enabled, the word is
    #       not a number, and stemming actually changes the word
    if ($this->StemmingEnabled && !is_numeric($Word))
    {
        $Stem = PorterStemmer::Stem($Word, TRUE);
        if ($Stem != $Word)
        {
            $IdsToCount[] = $this->GetStemId($Stem, TRUE);
        }
    }

    # for word and stem of word
    foreach ($IdsToCount as $Id)
    {
        if (!isset($this->WordCountAdded[$Id][$FieldId]))
        {
            # no count recorded yet, so add a new one to the DB
            $this->DB->Query("INSERT INTO SearchWordCounts"
                    ." (WordId, ItemId, FieldId, Count) VALUES"
                    ." (".$Id.", ".$ItemId.", ".$FieldId.", ".$Weight.")");

            # remember that we added count for this word
            $this->WordCountAdded[$Id][$FieldId] = TRUE;
        }
        else
        {
            # count already exists, so just bump it
            $this->DB->Query("UPDATE SearchWordCounts SET Count=Count+".$Weight
                    ." WHERE WordId=".$Id
                    ." AND ItemId=".$ItemId
                    ." AND FieldId=".$FieldId);
        }

        # whatever is processed next (the stem) carries half the weight
        $Weight = ceil($Weight / 2);
    }
}
# report that field content retrieval has not been implemented
# ("new" is required here -- without it PHP looks for a function named
#       Exception() and dies with an undefined function error instead of
#       throwing a catchable exception)
throw new Exception("GetFieldContent() not implemented.");
/**
 * Record word counts in the search database for a piece of text from one
 * field of one item.
 * @param int $ItemId ID of item the text belongs to.
 * @param int $FieldId ID of field the text came from.
 * @param int $Weight Weight used for the keyword field count of each word.
 * @param string $Text Text to break into words and record.
 * @param bool $IncludeInKeyword If TRUE, each word is also counted under
 *      the keyword field ID.
 */
private function RecordSearchInfoForText(
        $ItemId, $FieldId, $Weight, $Text, $IncludeInKeyword)
{
    # break text into normalized words, with phrase/group handling off
    $ParsedWords = $this->ParseSearchStringForWords($Text, "OR", TRUE);

    # nothing to record if parsing left no words
    if (count($ParsedWords) <= 0)
    {
        return;
    }

    # for each word found
    foreach (array_keys($ParsedWords) as $Word)
    {
        # count word against the field it came from
        $this->UpdateWordCount($Word, $ItemId, $FieldId);

        # count word against the keyword field as well, if requested
        if ($IncludeInKeyword)
        {
            $this->UpdateWordCount(
                    $Word, $ItemId, self::KEYWORD_FIELD_ID, $Weight);
        }
    }
}
/**
 * Normalize a search string and break it into individual words, flagging
 * each word according to any option character in front of it.
 * The string is trimmed, run through a list of regular expression
 * replacements, converted to lower case, trimmed again, and (if anything
 * is left) split on single spaces.  Every word starts out flagged
 * WORD_PRESENT.  A leading "-" is stripped and adds WORD_EXCLUDED; a
 * leading "~" is just stripped; a leading "+" is stripped and, like AND
 * logic, adds WORD_REQUIRED.  ExcludedTermCount, RequiredTermCount, and
 * InclusiveTermCount are incremented only for words not already seen in
 * this call, and newly seen words are also appended to SearchTermList.
 * NOTE(review): the "~"/"+" handling and the inclusive count appear to
 * apply only to words without a leading "-", but the else/brace lines
 * are not among those shown here -- confirm.
 * @param string $SearchString Text to parse.
 * @param string $Logic Search logic; "AND" makes words without an option
 *      character required.
 * @param bool $IgnorePhrases If TRUE, quote marks and parentheses are
 *      removed rather than the phrases/groups they enclose, per the
 *      pattern overrides below (the test of this flag itself is not among
 *      the lines shown here).  (OPTIONAL, defaults to FALSE)
 * @return array Presumably flags indexed by normalized word (the return
 *      statement is not among the lines shown here -- confirm).
 */
1920 # ---- common private methods (used in both searching and DB build) 1932 private function ParseSearchStringForWords(
1933 $SearchString, $Logic, $IgnorePhrases = FALSE)
1935 # strip off any surrounding whitespace 1936 $Text = trim($SearchString);
1938 # set up normalization replacement strings 1940 "/'s[^a-z0-9\\-+~]+/i", #
get rid of possessive plurals
1941 "/'/", #
get rid of single quotes / apostrophes
1942 "/\"[^\"]*\"/", #
get rid of phrases (NOTE: HARD-CODED
1944 "/\\([^)]*\\)/
", # get rid of groups (NOTE: HARD-CODED 1946 "/[^a-z0-9\\-+~]+/i
", # convert non-alphanumerics 1947 # / non-minus/plus to a space 1948 "/([^\\s])-+/i
", # convert minus preceded by anything 1949 # but whitespace to a space 1950 "/([^\\s])\\++/i
", # convert plus preceded by anything 1951 # but whitespace to a space 1952 "/-\\s/i
", # convert minus followed by whitespace to a space 1953 "/\\+\\s/i
", # convert plus followed by whitespace to a space 1954 "/~\\s/i
", # convert tilde followed by whitespace to a space 1955 "/[ ]+/
" # convert multiple spaces to one space 1957 $Replacements = array( 1971 # if we are supposed to ignore phrases and groups (series of words 1972 # in quotes or surrounded by parens) 1975 # switch phrase removal to double quote removal (HARD-CODED 1976 # INDEX INTO PATTERN LIST!!) 1977 $Patterns[2] = "/\
"/";
1979 # switch group removal to paren removal (HARD-CODED INDEX 1980 # INTO PATTERN LIST!!) 1981 $Patterns[3] =
"/[\(\)]+/";
1984 # remove punctuation from text and normalize whitespace 1985 $Text = preg_replace($Patterns, $Replacements, $Text);
1986 $this->
DMsg(2,
"Normalized search string is '".$Text.
"'");
1988 # convert text to lower case 1989 $Text = strtolower($Text);
1991 # strip off any extraneous whitespace 1992 $Text = trim($Text);
1994 # start with an empty array 1997 # if we have no words left after parsing 1998 if (strlen($Text) != 0)
2001 foreach (explode(
" ", $Text) as $Word)
2003 # grab first character of word 2004 $FirstChar = substr($Word, 0, 1);
2006 # strip off option characters and set flags appropriately 2007 $Flags = self::WORD_PRESENT;
2008 if ($FirstChar ==
"-")
2010 $Word = substr($Word, 1);
2011 $Flags |= self::WORD_EXCLUDED;
2012 if (!isset($Words[$Word]))
2014 $this->ExcludedTermCount++;
2019 if ($FirstChar ==
"~")
2021 $Word = substr($Word, 1);
2023 elseif (($Logic ==
"AND")
2024 || ($FirstChar ==
"+"))
2026 if ($FirstChar ==
"+")
2028 $Word = substr($Word, 1);
2030 $Flags |= self::WORD_REQUIRED;
2031 if (!isset($Words[$Word]))
2033 $this->RequiredTermCount++;
2036 if (!isset($Words[$Word]))
2038 $this->InclusiveTermCount++;
2039 $this->SearchTermList[] = $Word;
2043 # store flags to indicate word found 2044 $Words[$Word] = $Flags;
2045 $this->
DMsg(3,
"Word identified (".$Word.
")");
/**
 * Get ID for a word from the SearchWords table, optionally adding the
 * word to the table if it is not already there.  IDs are cached for the
 * remainder of the request.
 * @param string $Word Word to look up (case is ignored).
 * @param bool $AddIfNotFound If TRUE, word is added to the database when
 *      no ID is found for it.  (OPTIONAL, defaults to FALSE)
 * @return mixed ID for word, or NULL if word was not found and was not
 *      added.
 */
private function GetWordId($Word, $AddIfNotFound = FALSE)
{
    static $WordIdCache;

    # words are stored in lower case, so look them up (and cache them)
    #       that way too -- otherwise a mixed-case word would never match
    #       the entry that was inserted for it
    $Word = strtolower($Word);

    # if word was not in ID cache
    if (!isset($WordIdCache[$Word]))
    {
        # look up ID in database
        $WordId = $this->DB->Query("SELECT WordId"
                ." FROM SearchWords"
                ." WHERE WordText='".addslashes($Word)."'",
                "WordId");

        # if ID was not found and caller requested it be added
        if (($WordId === NULL) && $AddIfNotFound)
        {
            # add word to database
            $this->DB->Query("INSERT INTO SearchWords (WordText)"
                    ." VALUES ('".addslashes($Word)."')");

            # get ID for newly added word
            $WordId = $this->DB->LastInsertId();
        }

        # save ID to cache (a NULL is re-checked on the next call, since
        #       isset() does not regard it as set)
        $WordIdCache[$Word] = $WordId;
    }

    # return ID to caller
    return $WordIdCache[$Word];
}
/**
 * Get ID for a word stem from the SearchStems table, optionally adding
 * the stem to the table if it is not already there.  Returned IDs have
 * STEM_ID_OFFSET added to the database ID.  IDs are cached for the
 * remainder of the request.
 * @param string $Stem Word stem to look up (case is ignored).
 * @param bool $AddIfNotFound If TRUE, stem is added to the database when
 *      no ID is found for it.  (OPTIONAL, defaults to FALSE)
 * @return mixed ID for stem (database ID plus STEM_ID_OFFSET), or NULL if
 *      stem was not found and was not added.
 */
private function GetStemId($Stem, $AddIfNotFound = FALSE)
{
    static $StemIdCache;

    # stems are stored in lower case, so look them up (and cache them)
    #       that way too
    $Stem = strtolower($Stem);

    # if stem was not in ID cache
    if (!isset($StemIdCache[$Stem]))
    {
        # look up ID in database
        $StemId = $this->DB->Query("SELECT WordId"
                ." FROM SearchStems"
                ." WHERE WordText='".addslashes($Stem)."'",
                "WordId");

        # if ID was not found and caller requested it be added
        if (($StemId === NULL) && $AddIfNotFound)
        {
            # add stem to database
            $this->DB->Query("INSERT INTO SearchStems (WordText)"
                    ." VALUES ('".addslashes($Stem)."')");

            # get ID for newly added stem
            $StemId = $this->DB->LastInsertId();
        }

        # adjust from DB ID value to stem ID value
        # (only when there is an ID to adjust -- adding the offset to NULL
        #       would report the bare offset as if it were a real stem ID)
        if ($StemId !== NULL)
        {
            $StemId += self::STEM_ID_OFFSET;
        }

        # save ID to cache
        $StemIdCache[$Stem] = $StemId;
    }

    # return ID to caller
    return $StemIdCache[$Stem];
}
/**
 * Get the text of the word or stem that has the specified ID.  Results
 * are cached for the remainder of the request.
 * @param int $WordId ID of word, or ID of stem (i.e. a value at or above
 *      STEM_ID_OFFSET).
 * @return mixed Text of word or stem, as returned by the database lookup.
 */
private function GetWord($WordId)
{
    static $WordCache;

    # if word was not in cache
    if (!isset($WordCache[$WordId]))
    {
        # work out search location and database ID, without disturbing
        #       the ID we were given -- it is also the cache key, and
        #       caching a stem under its offset-adjusted ID would file it
        #       under the ID of an unrelated plain word
        if ($WordId >= self::STEM_ID_OFFSET)
        {
            $TableName = "SearchStems";
            $DbId = $WordId - self::STEM_ID_OFFSET;
        }
        else
        {
            $TableName = "SearchWords";
            $DbId = $WordId;
        }

        # look up word in database and save it to cache
        $WordCache[$WordId] = $this->DB->Query("SELECT WordText"
                ." FROM ".$TableName
                ." WHERE WordId='".$DbId."'",
                "WordText");
    }

    # return word to caller
    return $WordCache[$WordId];
}
/**
 * Get the type of the specified item, as recorded in the SearchItemTypes
 * table.  The whole table is loaded on first use and kept for the
 * remainder of the request.
 * @param int $ItemId ID of item.
 * @return int|null Item type, or NULL if no type is recorded for item.
 */
private function GetItemType($ItemId)
{
    static $ItemTypeCache;

    # load item types (indexed by item ID) the first time through
    if ($ItemTypeCache === NULL)
    {
        $this->DB->Query("SELECT * FROM SearchItemTypes");
        $ItemTypeCache = $this->DB->FetchColumn("ItemType", "ItemId");
    }

    # report lack of a type for items we know nothing about
    if (!isset($ItemTypeCache[$ItemId]))
    {
        return NULL;
    }

    return (int)$ItemTypeCache[$ItemId];
}
/**
 * Print debug message if level set high enough.
 * The message is printed prefixed with "SE: " and followed by an HTML
 * line break and a newline.
 * NOTE(review): the test that compares $Level against the current debug
 * level is not among the lines shown here -- confirm its direction
 * before relying on a particular threshold.
 * @param int $Level Debug level associated with the message.
 * @param string $Msg Message to print.
 */
2205 protected function DMsg($Level, $Msg)
2209 print
"SE: ".$Msg.
"<br>\n";
2213 # ---- BACKWARD COMPATIBILITY -------------------------------------------- 2215 # possible types of logical operators 2219 # pattern to detect search strings that are explicit comparisons
SearchTermCount()
Get total number of search terms indexed by search engine.
SetAllSynonyms($SynonymList)
Set all synonyms.
DropItem($ItemId)
Drop all data pertaining to item from search database.
AddField($FieldId, $FieldType, $ItemTypes, $Weight, $UsedInKeywordSearch)
Add field to include in searching.
RemoveSynonyms($Word, $Synonyms=NULL)
Remove synonym(s).
LoadSynonymsFromFile($FileName)
Load synonyms from a file.
Set of parameters used to perform a search.
SQL database abstraction object with smart query caching.
SearchFieldForPhrases($FieldId, $Phrase)
Search for phrase in specified field.
GetAllSynonyms()
Get all synonyms.
FilterOnSuppliedFunctions($Scores)
Filter search scores through any supplied functions.
UpdateForItem($ItemId, $ItemType)
Update search database for the specified item.
AddSynonyms($Word, $Synonyms)
Add synonyms.
const FIELDTYPE_DATERANGE
SearchTerms()
Get normalized list of search terms.
NumberOfResults($ItemType=NULL)
Get number of results found by most recent search.
FieldWeight($FieldId)
Get search weight for specified field.
FieldType($FieldId)
Get type of specified field (text/numeric/date/daterange).
ItemCount()
Get total number of items indexed by search engine.
FieldedSearch($SearchStrings, $StartingResult=0, $NumberOfResults=10, $SortByField=NULL, $SortDescending=TRUE)
Perform search across multiple fields, with different values or comparisons specified for each field...
__construct($ItemTableName, $ItemIdFieldName, $ItemTypeFieldName)
Object constructor.
Search($SearchParams, $StartingResult=0, $NumberOfResults=PHP_INT_MAX, $SortByField=NULL, $SortDescending=TRUE)
Perform search with specified parameters.
RemoveAllSynonyms()
Remove all synonyms.
DMsg($Level, $Msg)
Print debug message if level set high enough.
DropField($FieldId)
Drop all data pertaining to field from search database.
GetFieldContent($ItemId, $FieldId)
Retrieve content for specified field for specified item.
Core metadata archive search engine class.
$NumberOfResultsAvailable
const COMPARISON_OPERATOR_PATTERN
DebugLevel($NewValue)
Set debug output level.
UpdateForItems($StartingItemId, $NumberOfItems)
Update search database for the specified range of items.
FieldedSearchWeightScale($SearchParams)
Get total of weights for all fields involved in search, useful for assessing scale of scores in search results.
FieldInKeywordSearch($FieldId)
Get whether specified field is included in keyword searches.
AddResultFilterFunction($FunctionName)
Add function that will be called to filter search results.
SearchTime()
Get time that last search took, in seconds.
GetSynonyms($Word)
Get synonyms for word.