4 # FILE: SPT--Recommender.php
6 # Part of the Collection Workflow Integration System (CWIS)
7 # Copyright 2004-2013 Edward Almasy and Internet Scout Research Group
8 # http://scout.wisc.edu/cwis/
13 # ---- PUBLIC INTERFACE --------------------------------------------------
14 # define content field types
26 # set default parameters
27 $this->ContentCorrelationThreshold = 1;
29 # save database object
32 # save new configuration values
40 # set default debug state
44 # set level for debugging output
51 # ---- recommendation methods
53 # recommend items for specified user
54 function Recommend($UserId, $StartingResult = 0, $NumberOfResults = 10)
56 if ($this->
DebugLevel > 0) { print(
"REC: Recommend(${UserId}, ${StartingResult}, ${NumberOfResults})<br>\n"); }
58 # load in user ratings
61 $DB->Query(
"SELECT ".$this->ItemIdFieldName.
", ".$this->RatingFieldName
62 .
" FROM ".$this->RatingTableName
63 .
" WHERE ".$this->UserIdFieldName.
" = ${UserId}");
64 while ($Row =
$DB->FetchRow())
69 if ($this->
DebugLevel > 1) { print(
"REC: user has rated ".count($Ratings).
" items<br>\n"); }
71 # for each item that user has rated
73 foreach ($Ratings as $ItemId => $ItemRating)
75 # for each content correlation available for that item
76 $DB->Query(
"SELECT Correlation, ItemIdB "
77 .
"FROM RecContentCorrelations "
78 .
"WHERE ItemIdA = ${ItemId}");
79 while ($Row =
$DB->FetchRow())
81 # multiply that correlation by normalized rating and add
82 # resulting value to recommendation value for that item
83 if (isset($RecVals[$Row[
"ItemIdB"]]))
85 $RecVals[$Row[
"ItemIdB"]] +=
86 $Row[
"Correlation"] * ($ItemRating - 50);
90 $RecVals[$Row[
"ItemIdB"]] =
91 $Row[
"Correlation"] * ($ItemRating - 50);
93 if ($this->
DebugLevel > 9) { print(
"REC: RecVal[".$Row[
"ItemIdB"].
"] = ".$RecVals[$Row[
"ItemIdB"]].
"<br>\n"); }
96 if ($this->
DebugLevel > 1) { print(
"REC: found ".count($RecVals).
" total recommendations<br>\n"); }
98 # calculate average correlation between items
99 $ResultThreshold =
$DB->Query(
"SELECT AVG(Correlation) "
100 .
"AS Average FROM RecContentCorrelations",
"Average");
101 $ResultThreshold = round($ResultThreshold) * 2;
103 # for each recommended item
104 foreach ($RecVals as $ItemId => $RecVal)
106 # remove item from list if user already rated it
107 if (isset($Ratings[$ItemId]))
109 unset($RecVals[$ItemId]);
113 # scale recommendation value back to match thresholds
114 $RecVals[$ItemId] = round($RecVal / 50);
116 # remove item from recommendation list if value is below threshold
117 if ($RecVals[$ItemId] < $ResultThreshold)
119 unset($RecVals[$ItemId]);
123 if ($this->
DebugLevel > 1) { print(
"REC: found ".count($RecVals).
" positive recommendations<br>\n"); }
125 # sort recommendation list by value
126 if (isset($RecVals)) { arsort($RecVals, SORT_NUMERIC); }
128 # save total number of results available
129 $this->NumberOfResultsAvailable = count($RecVals);
131 # trim result list to match range requested by caller
132 $RecValKeys = array_slice(
133 array_keys($RecVals), $StartingResult, $NumberOfResults);
134 $RecValSegment = array();
135 foreach ($RecValKeys as $Key)
137 $RecValSegment[$Key] = $RecVals[$Key];
140 # return recommendation list to caller
141 return $RecValSegment;
144 # add function to be called to filter returned recommendation list
147 # save filter function name
148 $this->FilterFuncs[] = $FunctionName;
151 # return number of recommendations generated
157 # return recommendation generation time
163 # return list of items used to generate recommendation of specified item
166 # pull list of correlations from DB
167 $this->DB->Query(
"SELECT * FROM RecContentCorrelations, ".$this->RatingTableName
168 .
" WHERE (ItemIdA = ${RecommendedItemId}"
169 .
" OR ItemIdB = ${RecommendedItemId})"
170 .
" AND ".$this->UserIdFieldName.
" = ".$UserId
171 .
" AND (RecContentCorrelations.ItemIdA = ".$this->RatingTableName.
".".$this->ItemIdFieldName
172 .
" OR RecContentCorrelations.ItemIdB = ".$this->RatingTableName.
".".$this->ItemIdFieldName.
")"
173 .
" AND Rating >= 50 "
174 .
" ORDER BY Correlation DESC");
176 # for each correlation
177 $SourceList = array();
178 while ($Row = $this->DB->FetchRow())
180 # pick out appropriate item ID
181 if ($Row[
"ItemIdA"] == $RecommendedItemId)
183 $ItemId = $Row[
"ItemIdB"];
187 $ItemId = $Row[
"ItemIdA"];
190 # add item to recommendation source list
191 $SourceList[$ItemId] = $Row[
"Correlation"];
194 # return recommendation source list to caller
198 # dynamically generate and return list of items similar to specified item
201 if ($this->
DebugLevel > 1) { print(
"REC: searching for items similar to item \"".$ItemId.
"\"<br>\n"); }
203 # make sure we have item IDs available
206 # start with empty array
207 $SimilarItems = array();
210 foreach ($this->ItemIds as $Id)
212 # if item is not specified item
215 # calculate correlation of item to specified item
218 # if correlation is above threshold
219 if ($Correlation > $this->ContentCorrelationThreshold)
221 # add item to list of similar items
222 $SimilarItems[$Id] = $Correlation;
226 if ($this->
DebugLevel > 3) { print(
"REC: ".count($SimilarItems).
" similar items to item \"".$ItemId.
"\" found<br>\n"); }
228 # filter list of similar items (if any)
229 if (count($SimilarItems) > 0)
232 if ($this->
DebugLevel > 4) { print(
"REC: ".count($SimilarItems).
" similar items to item \"".$ItemId.
"\" left after filtering<br>\n"); }
235 # if any similar items left
236 if (count($SimilarItems) > 0)
238 # sort list of similar items in order of most to least similar
239 arsort($SimilarItems, SORT_NUMERIC);
242 # return list of similar items to caller
243 return $SimilarItems;
246 # dynamically generate and return list of recommended field values for item
249 if ($this->
DebugLevel > 1) { print(
"REC: generating field value recommendations for item \"".$ItemId.
"\"<br>\n"); }
251 # start with empty array of values
254 # generate list of similar items
257 # if similar items found
258 if (count($SimilarItems) > 0)
260 # prune list of similar items to only those whose correlation falls in the top third of the range between the average and the highest correlation
261 $AverageCorr = intval(array_sum($SimilarItems) / count($SimilarItems));
262 reset($SimilarItems);
263 $HighestCorr = current($SimilarItems);
264 $CorrThreshold = intval($HighestCorr - (($HighestCorr - $AverageCorr) / 3));
265 if ($this->
DebugLevel > 8) { print(
"REC: <i>Average Correlation: $AverageCorr Highest Correlation: $HighestCorr Correlation Threshold: $CorrThreshold </i><br>\n"); }
266 foreach ($SimilarItems as $ItemId => $ItemCorr)
268 if ($ItemCorr < $CorrThreshold)
270 unset($SimilarItems[$ItemId]);
273 if ($this->
DebugLevel > 6) { print(
"REC: ".count($SimilarItems).
" similar items left after threshold pruning<br>\n"); }
276 foreach ($SimilarItems as $SimItemId => $SimItemCorr)
279 foreach ($this->ContentFields as $FieldName => $FieldAttributes)
281 # load field data for this item
282 $FieldData = $this->GetFieldValue($SimItemId, $FieldName);
284 # if field data is array
285 if (is_array($FieldData))
287 # for each field data value
288 foreach ($FieldData as $FieldDataVal)
290 # if data value is not empty
291 $FieldDataVal = trim($FieldDataVal);
292 if (strlen($FieldDataVal) > 0)
294 # increment count for data value
295 $RecVals[$FieldName][$FieldDataVal]++;
301 # if data value is not empty
302 $FieldData = trim($FieldData);
303 if (strlen($FieldData) > 0)
305 # increment count for data value
306 $RecVals[$FieldName][$FieldData]++;
313 $MatchingCountThreshold = 3;
314 foreach ($RecVals as $FieldName => $FieldVals)
316 # determine cutoff threshold
317 arsort($FieldVals, SORT_NUMERIC);
319 $HighestCount = current($FieldVals);
320 $AverageCount = intval(array_sum($FieldVals) / count($FieldVals));
321 $CountThreshold = intval($AverageCount + (($HighestCount - $AverageCount) / 2));
322 if ($CountThreshold < $MatchingCountThreshold) { $CountThreshold = $MatchingCountThreshold; }
323 if ($this->
DebugLevel > 8) { print(
"REC: <i>Field: $FieldName Average Count: $AverageCount Highest Count: $HighestCount Count Threshold: $CountThreshold </i><br>\n"); }
325 # for each field data value
326 foreach ($FieldVals as $FieldVal => $FieldValCount)
328 # if value count is below threshold
329 if ($FieldValCount < $CountThreshold)
332 unset($RecVals[$FieldName][$FieldVal]);
336 if ($this->
DebugLevel > 3) { print(
"REC: found ".count($RecVals[$FieldName]).
" recommended values for field \"".$FieldName.
"\" after threshold pruning<br>\n"); }
340 # return recommended values to caller
345 # ---- database update methods
349 if ($this->
DebugLevel > 0) { print(
"REC: UpdateForItems(${StartingItemId}, ${NumberOfItems})<br>\n"); }
350 # make sure we have item IDs available
356 foreach ($this->ItemIds as $ItemId)
358 # if item ID is within requested range
359 if ($ItemId >= $StartingItemId)
361 # update recommender info for item
362 if ($this->
DebugLevel > 1) { print(
"REC: doing item ${ItemId}<br>\n"); }
366 # if we have done requested number of items
367 if ($ItemsUpdated >= $NumberOfItems)
370 if ($this->
DebugLevel > 1) { print(
"REC: bailing out with item ${ItemId}<br>\n"); }
376 # return ID of last resource updated to caller
382 if ($this->
DebugLevel > 1) { print(
"REC: updating for item \"".$ItemId.
"\"<br>\n"); }
384 # make sure we have item IDs available
387 # clear existing correlations for this item
388 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
389 .
"WHERE ItemIdA = ${ItemId}");
392 foreach ($this->ItemIds as $Id)
394 # if not a full pass, or item is later in list than current item
395 if (($FullPass == FALSE) || ($Id > $ItemId))
397 # update correlation value for item and target item
405 # drop all correlation entries referring to item
406 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
407 .
"WHERE ItemIdA = ".$ItemId.
" "
408 .
"OR ItemIdB = ".$ItemId);
413 # get average correlation
414 $AverageCorrelation = $this->DB->Query(
"SELECT AVG(Correlation) "
415 .
"AS Average FROM RecContentCorrelations",
"Average");
417 # dump all correlations at or below the average
418 if ($AverageCorrelation > 0)
420 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
421 .
"WHERE Correlation <= ${AverageCorrelation}");
434 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM "
435 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
436 $ItemIds = $this->DB->FetchColumn(
"Id");
442 # ---- PRIVATE INTERFACE -------------------------------------------------
461 # if item IDs not already loaded
462 if (!isset($this->ItemIds))
464 # load item IDs from DB
465 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM "
466 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
467 $this->ItemIds = array();
468 while ($Item = $this->DB->FetchRow())
470 $this->ItemIds[] = $Item[
"Id"];
479 # if data not already loaded
480 if (!isset($ItemData[$ItemId][$FieldName]))
482 # load field value from DB
483 $FieldValue = $this->GetFieldValue($ItemId, $FieldName);
485 # if field value is array
486 if (is_array($FieldValue))
488 # concatenate together text from array elements
489 $FieldValue = implode(
" ", $FieldValue);
492 # normalize text and break into word array
496 # return cached data to caller
497 return $ItemData[$ItemId][$FieldName];
500 # calculate content correlation between two items and return value to caller
503 static $CorrelationCache;
505 if ($this->
DebugLevel > 10) { print(
"REC: calculating correlation"
506 .
" between items $ItemIdA and $ItemIdB<br>\n"); }
508 # order item ID numbers
509 if ($ItemIdA > $ItemIdB)
516 # if we already have the correlation
517 if (isset($CorrelationCache[$ItemIdA][$ItemIdB]))
519 # retrieve correlation from cache
520 $TotalCorrelation = $CorrelationCache[$ItemIdA][$ItemIdB];
524 # if list of fields to correlate specified
525 if ($FieldList != NULL)
527 # create list with only specified fields
528 foreach ($FieldList as $FieldName)
539 # for each content field
540 $TotalCorrelation = 0;
543 # if field is of a type that we use for correlation
544 $FieldType = intval($FieldAttributes[
"FieldType"]);
551 if ($this->
DebugLevel > 15) { print(
"REC: loaded ".count($ItemAData).
" terms for item #".$ItemIdA.
" and ".count($ItemBData).
" terms for item #".$ItemIdB.
" for field \"".$FieldName.
"\"<br>\n"); }
553 # call appropriate routine to get correlation
559 $ItemAData, $ItemBData);
563 # add correlation multiplied by weight to total
564 $TotalCorrelation += $Correlation * $FieldAttributes[
"Weight"];
568 # store correlation to cache
569 $CorrelationCache[$ItemIdA][$ItemIdB] = $TotalCorrelation;
572 # return correlation value to caller
573 if ($this->
DebugLevel > 9) { print(
"REC: correlation between items $ItemIdA and $ItemIdB found to be $TotalCorrelation<br>\n"); }
574 return $TotalCorrelation;
577 # calculate content correlation between two items and update in DB
580 if ($this->
DebugLevel > 6) { print(
"REC: updating correlation between items $ItemIdA and $ItemIdB<br>\n"); }
582 # bail out if two items are the same
583 if ($ItemIdA == $ItemIdB) {
return; }
585 # calculate correlation
588 # save new correlation
662 # strip any HTML tags
663 $Text = strip_tags($Text);
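665 # strip any punctuation (NOTE(review): the pattern below has no [...] character class, so as written it only matches all of those characters appearing consecutively in that order, not any single one of them - confirm intent)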
666 $Text = preg_replace(
"/,\\.\\?-\\(\\)\\[\\]\"/",
" ", $Text); #
"
668 # normalize whitespace
669 $Text = trim(preg_replace("/[\\s]+/
", " ", $Text));
671 # convert to all lower case
672 $Text = strtolower($Text);
674 # split text into arrays of words
675 $Words = explode(" ", $Text);
677 # filter out all stop words
678 $Words = array_diff($Words, $StopWords);
680 # return word array to caller
# Score the textual overlap between two word lists.
#   $WordsA, $WordsB - arrays of words to compare (presumably the word arrays
#       built by NormalizeAndParseText() - TODO confirm against caller)
#   returns the number of entries in $WordsA whose value also appears in
#       $WordsB; array_intersect() keeps every matching entry of its first
#       argument, so a word repeated in $WordsA is counted once per repeat
684 function CalcTextCorrelation($WordsA, $WordsB)
686 # get array containing intersection of two word arrays
687 $IntersectWords = array_intersect($WordsA, $WordsB);
689 # return number of words remaining as score
690 return count($IntersectWords);
693 function ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation = -1)
695 # if item ID A is greater than item ID B
696 if ($ItemIdA > $ItemIdB)
704 # if new correlation value provided
705 if ($NewCorrelation != -1)
707 # if new value is above threshold
708 if ($NewCorrelation >= $this->ContentCorrelationThreshold)
710 # insert new correlation value in DB
711 $this->DB->Query("INSERT INTO RecContentCorrelations
"
712 ."(ItemIdA, ItemIdB, Correlation)
"
713 ."VALUES (${ItemIdA}, ${ItemIdB}, ${NewCorrelation})
");
715 # return value is the new correlation value
716 $Correlation = $NewCorrelation;
721 # return value is zero
727 # retrieve correlation value from DB
728 $Correlation = $this->DB->Query(
729 "SELECT Correlation FROM RecContentCorrelations
"
730 ."WHERE ItemIdA = ${ItemIdA} AND ItemIdB = ${ItemIdB}
",
733 # if no value found in DB
734 if ($Correlation == FALSE)
736 # return value is zero
741 # return correlation value to caller
745 function FilterOnSuppliedFunctions($Results)
747 # if filter functions have been set
748 if (count($this->FilterFuncs) > 0)
751 foreach ($Results as $ResourceId => $Result)
753 # for each filter function
754 foreach ($this->FilterFuncs as $FuncName)
756 # if filter function returns TRUE for result resource (i.e. rejects it)
757 if ($FuncName($ResourceId))
760 if ($this->DebugLevel > 2) { print("REC: filter callback rejected resource ${ResourceId}<br>\n
"); }
761 unset($Results[$ResourceId]);
763 # bail out of filter func loop
770 # return filtered list to caller
RecommendFieldValues($ItemId, $FieldList=NULL)
UpdateForItems($StartingItemId, $NumberOfItems)
GetSourceList($UserId, $RecommendedItemId)
ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation=-1)
const CONTENTFIELDTYPE_CONTROLLEDNAME
AddResultFilterFunction($FunctionName)
const CONTENTFIELDTYPE_DATE
FilterOnSuppliedFunctions($Results)
GetItemIds()
Retrieve all item IDs.
UpdateForItem($ItemId, $FullPass=FALSE)
const CONTENTFIELDTYPE_NUMERIC
GetFieldData($ItemId, $FieldName)
UpdateContentCorrelation($ItemIdA, $ItemIdB)
Recommend($UserId, $StartingResult=0, $NumberOfResults=10)
CalcTextCorrelation($WordsA, $WordsB)
const CONTENTFIELDTYPE_TEXT
Recommender(&$DB, $ItemTableName, $RatingTableName, $ItemIdFieldName, $UserIdFieldName, $RatingFieldName, $ContentFields)
FindSimilarItems($ItemId, $FieldList=NULL)
const CONTENTFIELDTYPE_DATERAMGE
$NumberOfResultsAvailable
NormalizeAndParseText($Text)
CalculateContentCorrelation($ItemIdA, $ItemIdB, $FieldList=NULL)
$ContentCorrelationThreshold