Search:

CWIS Developers Documentation

  • Main Page
  • Classes
  • Files
  • File List
  • File Members

Recommender.php

Go to the documentation of this file.
00001 <?PHP
00002 
00003 #
00004 #   FILE:  SPT--Recommender.php
00005 #
00006 #   METHODS PROVIDED:
00007 #       Recommender()
00008 #           - constructor
00009 #       Recommend($UserId, $StartingResult, $NumberOfResults)
00010 #           - return recommended items for the specified user
00011 #
00012 #   AUTHOR:  Edward Almasy
00013 #
00014 #   Part of the Scout Portal Toolkit
00015 #   Copyright 2002-2004 Internet Scout Project
00016 #   http://scout.wisc.edu
00017 #
00018 
00019 class Recommender {
00020 
00021     # ---- PUBLIC INTERFACE --------------------------------------------------
00022     # define content field types
00023     const CONTENTFIELDTYPE_TEXT =  1;
00024     const CONTENTFIELDTYPE_NUMERIC =  2;
00025     const CONTENTFIELDTYPE_CONTROLLEDNAME =  3;
00026     const CONTENTFIELDTYPE_DATE =  4;
00027     const CONTENTFIELDTYPE_DATERAMGE =  5;
00028 
00029     # object constructor
00030     function Recommender(&$DB, $ItemTableName, $RatingTableName, 
00031             $ItemIdFieldName, $UserIdFieldName, $RatingFieldName,
00032             $ContentFields)
00033     {
00034         # set default parameters
00035         $this->ContentCorrelationThreshold = 1;
00036 
00037         # save database object
00038         $this->DB =& $DB;
00039 
00040         # save new configuration values
00041         $this->ItemTableName = $ItemTableName;
00042         $this->RatingTableName = $RatingTableName;
00043         $this->ItemIdFieldName = $ItemIdFieldName;
00044         $this->UserIdFieldName = $UserIdFieldName;
00045         $this->RatingFieldName = $RatingFieldName;
00046         $this->ContentFields = $ContentFields;
00047 
00048         # set default debug state
00049         $this->DebugLevel = 0;
00050     }
00051 
00052     # set level for debugging output
00053     function DebugLevel($Setting)
00054     {
00055         $this->DebugLevel = $Setting;
00056     }
00057 
00058 
00059     # ---- recommendation methods
00060 
00061     # recommend items for specified user
00062     function Recommend($UserId, $StartingResult = 0, $NumberOfResults = 10)
00063     {
00064         if ($this->DebugLevel > 0) {  print("REC:  Recommend(${UserId}, ${StartingResult}, ${NumberOfResults})<br>\n");  }
00065 
00066         # load in user ratings
00067         $Ratings = array();
00068         $DB =& $this->DB;
00069         $DB->Query("SELECT ".$this->ItemIdFieldName.", ".$this->RatingFieldName
00070                 ." FROM ".$this->RatingTableName
00071                 ." WHERE ".$this->UserIdFieldName." = ${UserId}");
00072         while ($Row = $DB->FetchRow())
00073         {
00074             $Ratings[$Row[$this->ItemIdFieldName]] = 
00075                     $Row[$this->RatingFieldName];
00076         }
00077         if ($this->DebugLevel > 1) {  print("REC:  user has rated ".count($Ratings)." items<br>\n");  }
00078 
00079         # for each item that user has rated
00080         $RecVals = array();
00081         foreach ($Ratings as $ItemId => $ItemRating)
00082         {
00083             # for each content correlation available for that item
00084             $DB->Query("SELECT Correlation, ItemIdB "
00085                     ."FROM RecContentCorrelations "
00086                     ."WHERE ItemIdA = ${ItemId}");
00087             while ($Row = $DB->FetchRow())
00088             {
00089                 # multiply that correlation by normalized rating and add
00090                 #       resulting value to recommendation value for that item
00091                 if (isset($RecVals[$Row["ItemIdB"]]))
00092                 {
00093                     $RecVals[$Row["ItemIdB"]] +=
00094                             $Row["Correlation"] * ($ItemRating - 50);
00095                 }
00096                 else
00097                 {
00098                     $RecVals[$Row["ItemIdB"]] =
00099                             $Row["Correlation"] * ($ItemRating - 50);
00100                 }
00101                 if ($this->DebugLevel > 9) {  print("REC:  RecVal[".$Row["ItemIdB"]."] = ".$RecVals[$Row["ItemIdB"]]."<br>\n");  }
00102             }
00103         }
00104         if ($this->DebugLevel > 1) {  print("REC:  found ".count($RecVals)." total recommendations<br>\n");  }
00105 
00106         # calculate average correlation between items
00107         $ResultThreshold = $DB->Query("SELECT AVG(Correlation) "
00108                 ."AS Average FROM RecContentCorrelations", "Average");
00109         $ResultThreshold = round($ResultThreshold) * 2;
00110 
00111         # for each recommended item
00112         foreach ($RecVals as $ItemId => $RecVal)
00113         {
00114             # remove item from list if user already rated it
00115             if (isset($Ratings[$ItemId]))
00116             {
00117                 unset($RecVals[$ItemId]);  
00118             }
00119             else
00120             {
00121                 # scale recommendation value back to match thresholds
00122                 $RecVals[$ItemId] = round($RecVal / 50);
00123 
00124                 # remove item from recommendation list if value is below threshold
00125                 if ($RecVals[$ItemId] < $ResultThreshold)
00126                 {  
00127                     unset($RecVals[$ItemId]);  
00128                 }
00129             }
00130         }
00131         if ($this->DebugLevel > 1) {  print("REC:  found ".count($RecVals)." positive recommendations<br>\n");  }
00132 
00133         # sort recommendation list by value
00134         if (isset($RecVals)) {  arsort($RecVals, SORT_NUMERIC);  }
00135 
00136         # save total number of results available
00137         $this->NumberOfResultsAvailable = count($RecVals);
00138 
00139         # trim result list to match range requested by caller
00140         $RecValKeys = array_slice(
00141                 array_keys($RecVals), $StartingResult, $NumberOfResults);
00142         $RecValSegment = array();
00143         foreach ($RecValKeys as $Key) 
00144         {  
00145             $RecValSegment[$Key] = $RecVals[$Key];  
00146         }
00147 
00148         # return recommendation list to caller
00149         return $RecValSegment;
00150     }
00151 
00152     # add function to be called to filter returned recommendation list
00153     function AddResultFilterFunction($FunctionName)
00154     {
00155         # save filter function name
00156         $this->FilterFuncs[] = $FunctionName;
00157     }
00158 
00159     # return number of recommendations generated
00160     function NumberOfResults()
00161     {
00162         return $this->NumberOfResultsAvailable;
00163     }
00164 
00165     # return recommendation generation time
00166     function SearchTime()
00167     {
00168         return $this->LastSearchTime;
00169     }
00170 
00171     # return list of items used to generate recommendation of specified item
00172     function GetSourceList($UserId, $RecommendedItemId)
00173     {
00174         # pull list of correlations from DB
00175         $this->DB->Query("SELECT * FROM RecContentCorrelations, ".$this->RatingTableName
00176                 ." WHERE (ItemIdA = ${RecommendedItemId}"
00177                         ." OR ItemIdB = ${RecommendedItemId})"
00178                         ." AND ".$this->UserIdFieldName." = ".$UserId
00179                         ." AND (RecContentCorrelations.ItemIdA = ".$this->RatingTableName.".".$this->ItemIdFieldName
00180                         ." OR RecContentCorrelations.ItemIdB = ".$this->RatingTableName.".".$this->ItemIdFieldName.")"
00181                         ." AND Rating >= 50 "
00182                 ." ORDER BY Correlation DESC");
00183 
00184         # for each correlation
00185         $SourceList = array();
00186         while ($Row = $this->DB->FetchRow())
00187         {
00188             # pick out appropriate item ID
00189             if ($Row["ItemIdA"] == $RecommendedItemId)
00190             {
00191                 $ItemId = $Row["ItemIdB"];
00192             }
00193             else
00194             {
00195                 $ItemId = $Row["ItemIdA"];
00196             }
00197 
00198             # add item to recommendation source list
00199             $SourceList[$ItemId] = $Row["Correlation"];
00200         }
00201 
00202         # return recommendation source list to caller
00203         return $SourceList;
00204     }
00205 
00206     # dynamically generate and return list of items similar to specified item
00207     function FindSimilarItems($ItemId, $FieldList = NULL)
00208     {
00209         if ($this->DebugLevel > 1) {  print("REC:  searching for items similar to item \"".$ItemId."\"<br>\n");  }
00210 
00211         # make sure we have item IDs available
00212         $this->LoadItemIds();
00213 
00214         # start with empty array
00215         $SimilarItems = array();
00216 
00217         # for every item
00218         foreach ($this->ItemIds as $Id)
00219         {
00220             # if item is not specified item
00221             if ($Id != $ItemId)
00222             {
00223                 # calculate correlation of item to specified item
00224                 $Correlation = $this->CalculateContentCorrelation($ItemId, $Id, $FieldList);
00225 
00226                 # if correlation is above threshold
00227                 if ($Correlation > $this->ContentCorrelationThreshold)
00228                 {
00229                     # add item to list of similar items
00230                     $SimilarItems[$Id] = $Correlation;
00231                 }
00232             }
00233         }
00234         if ($this->DebugLevel > 3) {  print("REC:  ".count($SimilarItems)." similar items to item \"".$ItemId."\" found<br>\n");  }
00235 
00236         # filter list of similar items (if any)
00237         if (count($SimilarItems) > 0)
00238         {
00239             $SimilarItems = $this->FilterOnSuppliedFunctions($SimilarItems);
00240             if ($this->DebugLevel > 4) {  print("REC:  ".count($SimilarItems)." similar items to item \"".$ItemId."\" left after filtering<br>\n");  }
00241         }
00242         
00243         # if any similar items left
00244         if (count($SimilarItems) > 0)
00245         {
00246             # sort list of similar items in order of most to least similar
00247             arsort($SimilarItems, SORT_NUMERIC);
00248         }
00249 
00250         # return list of similar items to caller
00251         return $SimilarItems;
00252     }
00253 
00254     # dynamically generate and return list of recommended field values for item
00255     function RecommendFieldValues($ItemId, $FieldList = NULL)
00256     {
00257         if ($this->DebugLevel > 1) {  print("REC:  generating field value recommendations for item \"".$ItemId."\"<br>\n");  }
00258 
00259         # start with empty array of values
00260         $RecVals = array();
00261 
00262         # generate list of similar items
00263         $SimilarItems = $this->FindSimilarItems($ItemId, $FieldList);
00264         
00265         # if similar items found
00266         if (count($SimilarItems) > 0)
00267         {
00268             # prune list of similar items to only top third of better-than-average
00269             $AverageCorr = intval(array_sum($SimilarItems) / count($SimilarItems));
00270             reset($SimilarItems);
00271             $HighestCorr = current($SimilarItems);
00272             $CorrThreshold = intval($HighestCorr - (($HighestCorr - $AverageCorr) / 3));
00273             if ($this->DebugLevel > 8) {  print("REC:  <i>Average Correlation: $AverageCorr &nbsp;&nbsp;&nbsp;&nbsp; Highest Correlation: $HighestCorr &nbsp;&nbsp;&nbsp;&nbsp; Correlation Threshold: $CorrThreshold </i><br>\n");  }
00274             foreach ($SimilarItems as $ItemId => $ItemCorr)
00275             {
00276                 if ($ItemCorr < $CorrThreshold)
00277                 {
00278                     unset($SimilarItems[$ItemId]);
00279                 }
00280             }
00281             if ($this->DebugLevel > 6) {  print("REC:  ".count($SimilarItems)." similar items left after threshold pruning<br>\n");  }
00282 
00283             # for each item
00284             foreach ($SimilarItems as $SimItemId => $SimItemCorr)
00285             {
00286                 # for each field
00287                 foreach ($this->ContentFields as $FieldName => $FieldAttributes)
00288                 {
00289                     # load field data for this item
00290                     $FieldData = $this->GetFieldValue($SimItemId, $FieldName);
00291 
00292                     # if field data is array
00293                     if (is_array($FieldData))
00294                     {
00295                         # for each field data value
00296                         foreach ($FieldData as $FieldDataVal)
00297                         {
00298                             # if data value is not empty
00299                             $FieldDataVal = trim($FieldDataVal);
00300                             if (strlen($FieldDataVal) > 0)
00301                             {
00302                                 # increment count for data value
00303                                 $RecVals[$FieldName][$FieldDataVal]++;
00304                             }
00305                         }
00306                     }
00307                     else
00308                     {
00309                         # if data value is not empty
00310                         $FieldData = trim($FieldData);
00311                         if (strlen($FieldData) > 0)
00312                         {
00313                             # increment count for data value
00314                             $RecVals[$FieldName][$FieldData]++;
00315                         }
00316                     }
00317                 }
00318             }
00319 
00320             # for each field
00321             $MatchingCountThreshold = 3;
00322             foreach ($RecVals as $FieldName => $FieldVals)
00323             {
00324                 # determine cutoff threshold
00325                 arsort($FieldVals, SORT_NUMERIC);
00326                 reset($FieldVals);
00327                 $HighestCount = current($FieldVals);
00328                 $AverageCount = intval(array_sum($FieldVals) / count($FieldVals));
00329                 $CountThreshold = intval($AverageCount + (($HighestCount - $AverageCount) / 2));
00330                 if ($CountThreshold < $MatchingCountThreshold) {  $CountThreshold = $MatchingCountThreshold;  }
00331                 if ($this->DebugLevel > 8) {  print("REC:  <i>Field: $FieldName &nbsp;&nbsp;&nbsp;&nbsp;  Average Count: $AverageCount &nbsp;&nbsp;&nbsp;&nbsp; Highest Count: $HighestCount &nbsp;&nbsp;&nbsp;&nbsp; Count Threshold: $CountThreshold </i><br>\n");  }
00332 
00333                 # for each field data value
00334                 foreach ($FieldVals as $FieldVal => $FieldValCount)
00335                 {
00336                     # if value count is below threshold
00337                     if ($FieldValCount < $CountThreshold)
00338                     {
00339                         # unset value
00340                         unset($RecVals[$FieldName][$FieldVal]);
00341                     }
00342                 }
00343 
00344                 if ($this->DebugLevel > 3) {  print("REC:  found ".count($RecVals[$FieldName])." recommended values for field \"".$FieldName."\" after threshold pruning<br>\n");  }
00345             }
00346         }
00347 
00348         # return recommended values to caller
00349         return $RecVals;
00350     }
00351 
00352 
00353     # ---- database update methods
00354 
00355     function UpdateForItems($StartingItemId, $NumberOfItems)
00356     {
00357         if ($this->DebugLevel > 0) {  print("REC:  UpdateForItems(${StartingItemId}, ${NumberOfItems})<br>\n");  }
00358         # make sure we have item IDs available
00359         $this->LoadItemIds();
00360 
00361         # for every item
00362         $ItemsUpdated = 0;
00363         $ItemId = NULL;
00364         foreach ($this->ItemIds as $ItemId)
00365         {
00366             # if item ID is within requested range
00367             if ($ItemId >= $StartingItemId)
00368             {
00369                 # update recommender info for item
00370                 if ($this->DebugLevel > 1) {  print("REC:  doing item ${ItemId}<br>\n");  }
00371                 $this->UpdateForItem($ItemId, TRUE);
00372                 $ItemsUpdated++;
00373 
00374                 # if we have done requested number of items
00375                 if ($ItemsUpdated >= $NumberOfItems)
00376                 {
00377                     # bail out
00378                     if ($this->DebugLevel > 1) {  print("REC:  bailing out with item ${ItemId}<br>\n");  }
00379                     return $ItemId;
00380                 }
00381             }
00382         }
00383 
00384         # return ID of last resource updated to caller
00385         return $ItemId;
00386     }
00387 
00388     function UpdateForItem($ItemId, $FullPass = FALSE)
00389     {   
00390         if ($this->DebugLevel > 1) {  print("REC:  updating for item \"".$ItemId."\"<br>\n");  }
00391 
00392         # make sure we have item IDs available
00393         $this->LoadItemIds();
00394 
00395         # clear existing correlations for this item
00396         $this->DB->Query("DELETE FROM RecContentCorrelations "
00397                 ."WHERE ItemIdA = ${ItemId}");
00398 
00399         # for every item
00400         foreach ($this->ItemIds as $Id)
00401         {
00402             # if full pass and item is later in list than current item
00403             if (($FullPass == FALSE) || ($Id > $ItemId))
00404             {
00405                 # update correlation value for item and target item
00406                 $this->UpdateContentCorrelation($ItemId, $Id);
00407             }
00408         }
00409     }
00410 
00411     function DropItem($ItemId)
00412     {
00413         # drop all correlation entries referring to item
00414         $this->DB->Query("DELETE FROM RecContentCorrelations "
00415                          ."WHERE ItemIdA = ".$ItemId." "
00416                             ."OR ItemIdB = ".$ItemId);
00417     }
00418 
00419     function PruneCorrelations()
00420     {
00421         # get average correlation
00422         $AverageCorrelation = $this->DB->Query("SELECT AVG(Correlation) "
00423                 ."AS Average FROM RecContentCorrelations", "Average");
00424 
00425         # dump all below-average correlations
00426         if ($AverageCorrelation > 0)
00427         {
00428             $this->DB->Query("DELETE FROM RecContentCorrelations "
00429                     ."WHERE Correlation <= ${AverageCorrelation}");
00430         }
00431     }
00432 
00437     function GetItemIds()
00438     {
00439         static $ItemIds;
00440         if (!isset($ItemIds))
00441         {
00442             $this->DB->Query("SELECT ".$this->ItemIdFieldName." AS Id FROM "
00443                     .$this->ItemTableName." ORDER BY ".$this->ItemIdFieldName);
00444             $ItemIds = $this->DB->FetchColumn("Id");
00445         }
00446         return $ItemIds;
00447     }
00448 
00449 
00450     # ---- PRIVATE INTERFACE -------------------------------------------------
00451 
00452     var $ContentCorrelationThreshold;
00453     var $ContentFields;
00454     var $ItemTableName;
00455     var $RatingTableName;
00456     var $ItemIdFieldName;
00457     var $UserIdFieldName;
00458     var $RatingFieldName;
00459     var $ItemIds;
00460     var $DB;
00461     var $FilterFuncs;
00462     var $LastSearchTime;
00463     var $NumberOfResultsAvailable;
00464     var $DebugLevel;
00465 
00466 
00467     function LoadItemIds()
00468     {
00469         # if item IDs not already loaded
00470         if (!isset($this->ItemIds))
00471         {
00472             # load item IDs from DB
00473             $this->DB->Query("SELECT ".$this->ItemIdFieldName." AS Id FROM "
00474                     .$this->ItemTableName." ORDER BY ".$this->ItemIdFieldName);
00475             $this->ItemIds = array();
00476             while ($Item = $this->DB->FetchRow())
00477             {
00478                 $this->ItemIds[] = $Item["Id"];
00479             }
00480         }
00481     }
00482 
00483     function GetFieldData($ItemId, $FieldName)
00484     {
00485         static $ItemData;
00486         static $CachedItemList;
00487 
00488         # if data not already loaded
00489         if (!isset($ItemData[$ItemId][$FieldName]))
00490         {
00491             # load field value from DB
00492             $FieldValue = $this->GetFieldValue($ItemId, $FieldName);
00493 
00494             # if field value is array
00495             if (is_array($FieldValue))
00496             {
00497                 # concatenate together text from array elements
00498                 $FieldValue = implode(" ", $FieldValue);
00499             }
00500 
00501             # normalize text and break into word array
00502             $ItemData[$ItemId][$FieldName] = $this->NormalizeAndParseText($FieldValue);
00503 
00504             # if more items than cache limit
00505             if (count($ItemData) > 1000)
00506             {
00507                 # dump oldest item
00508                 reset($ItemData);
00509                 list($DumpedItemId, $DumpedItemData) = each($ItemData);
00510                 unset($ItemData[$DumpedItemId]);
00511             }
00512         }
00513 
00514         # return cached data to caller
00515         return $ItemData[$ItemId][$FieldName];
00516     }
00517 
00518     # calculate content correlation between two items and return value to caller
00519     function CalculateContentCorrelation($ItemIdA, $ItemIdB, $FieldList = NULL)
00520     {
00521         static $CorrelationCache;
00522         
00523         if ($this->DebugLevel > 10) {  print("REC:  calculating correlation between items $ItemIdA and $ItemIdB<br>\n");  }
00524         
00525         # order item ID numbers
00526         if ($ItemIdA > $ItemIdB)
00527         {
00528             $Temp = $ItemIdA;
00529             $ItemIdA = $ItemIdB;
00530             $ItemIdB = $Temp;
00531         }
00532         
00533         # if we already have the correlation
00534         if (isset($CorrelationCache[$ItemIdA][$ItemIdB]))
00535         {
00536             # retrieve correlation from cache
00537             $TotalCorrelation = $CorrelationCache[$ItemIdA][$ItemIdB];
00538         }
00539         else
00540         {
00541             # if list of fields to correlate specified
00542             if ($FieldList != NULL)
00543             {
00544                 # create list with only specified fields
00545                 foreach ($FieldList as $FieldName)
00546                 {
00547                     $ContentFields[$FieldName] = $this->ContentFields[$FieldName];
00548                 }
00549             }
00550             else
00551             {
00552                 # use all fields
00553                 $ContentFields = $this->ContentFields;
00554             }
00555 
00556             # for each content field
00557             $TotalCorrelation = 0;
00558             foreach ($ContentFields as $FieldName => $FieldAttributes)
00559             {
00560                 # if field is of a type that we use for correlation
00561                 $FieldType = intval($FieldAttributes["FieldType"]);
00562                 if (($FieldType == Recommender::CONTENTFIELDTYPE_TEXT)
00563                         || ($FieldType == Recommender::CONTENTFIELDTYPE_CONTROLLEDNAME))
00564                 {
00565                     # load data
00566                     $ItemAData = $this->GetFieldData($ItemIdA, $FieldName);
00567                     $ItemBData = $this->GetFieldData($ItemIdB, $FieldName);
00568                     if ($this->DebugLevel > 15) {  print("REC:  loaded ".count($ItemAData)." terms for item #".$ItemIdA." and ".count($ItemBData)." terms for item #".$ItemIdB." for field \"".$FieldName."\"<br>\n");  }
00569 
00570                     # call appropriate routine to get correlation
00571                     switch ($FieldType)
00572                     {
00573                         case Recommender::CONTENTFIELDTYPE_TEXT:
00574                         case Recommender::CONTENTFIELDTYPE_CONTROLLEDNAME:
00575                             $Correlation = $this->CalcTextCorrelation(
00576                                     $ItemAData, $ItemBData);
00577                             break;
00578                     }
00579 
00580                     # add correlation multiplied by weight to total
00581                     $TotalCorrelation += $Correlation * $FieldAttributes["Weight"];
00582                 }
00583             }
00584             
00585             # store correlation to cache
00586             $CorrelationCache[$ItemIdA][$ItemIdB] = $TotalCorrelation;
00587         }
00588 
00589         # return correlation value to caller
00590         if ($this->DebugLevel > 9) {  print("REC:  correlation between items $ItemIdA and $ItemIdB found to be $TotalCorrelation<br>\n");  }
00591         return $TotalCorrelation;
00592     }
00593 
00594     # calculate content correlation between two items and update in DB
00595     function UpdateContentCorrelation($ItemIdA, $ItemIdB)
00596     {
00597         if ($this->DebugLevel > 6) {  print("REC:  updating correlation between items $ItemIdA and $ItemIdB<br>\n");  }
00598 
00599         # bail out if two items are the same
00600         if ($ItemIdA == $ItemIdB) {  return;  }
00601 
00602         # calculate correlation
00603         $Correlation = $this->CalculateContentCorrelation($ItemIdA, $ItemIdB);
00604 
00605         # save new correlation
00606         $this->ContentCorrelation($ItemIdA, $ItemIdB, $Correlation);
00607     }
00608 
00609     function NormalizeAndParseText($Text)
00610     {
00611         $StopWords = array(
00612                 "a",
00613                 "about",
00614                 "also",
00615                 "an",
00616                 "and",
00617                 "are",
00618                 "as",
00619                 "at",
00620                 "be",
00621                 "but",
00622                 "by",
00623                 "can",
00624                 "each",
00625                 "either",
00626                 "for",
00627                 "from",
00628                 "has",
00629                 "he",
00630                 "her",
00631                 "here",
00632                 "hers",
00633                 "him",
00634                 "his",
00635                 "how",
00636                 "i",
00637                 "if",
00638                 "in",
00639                 "include",
00640                 "into",
00641                 "is",
00642                 "it",
00643                 "its",
00644                 "me",
00645                 "neither",
00646                 "no",
00647                 "nor",
00648                 "not",
00649                 "of",
00650                 "on",
00651                 "or",
00652                 "so",
00653                 "she",
00654                 "than",
00655                 "that",
00656                 "the",
00657                 "their",
00658                 "them",
00659                 "then",
00660                 "there",
00661                 "these",
00662                 "they",
00663                 "this",
00664                 "those",
00665                 "through",
00666                 "to",
00667                 "too",
00668                 "very",
00669                 "what",
00670                 "when",
00671                 "where",
00672                 "while",
00673                 "who",
00674                 "why",
00675                 "will",
00676                 "you",
00677                 "");
00678 
00679         # strip any HTML tags
00680         $Text = strip_tags($Text);
00681 
00682         # strip any punctuation
00683         $Text = preg_replace("/,\\.\\?-\\(\\)\\[\\]\"/", " ", $Text);   # "
00684 
00685         # normalize whitespace
00686         $Text = trim(preg_replace("/[\\s]+/", " ", $Text));
00687 
00688         # convert to all lower case
00689         $Text = strtolower($Text);
00690 
00691         # split text into arrays of words
00692         $Words = explode(" ", $Text);
00693 
00694         # filter out all stop words
00695         $Words = array_diff($Words, $StopWords);
00696 
00697         # return word array to caller
00698         return $Words;
00699     }
00700 
00701     function CalcTextCorrelation($WordsA, $WordsB)
00702     {
00703         # get array containing intersection of two word arrays
00704         $IntersectWords = array_intersect($WordsA, $WordsB);
00705 
00706         # return number of words remaining as score
00707         return count($IntersectWords);
00708     }
00709 
00710     function ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation = -1)
00711     {
00712         # if item ID A is greater than item ID B
00713         if ($ItemIdA > $ItemIdB)
00714         {
00715             # swap item IDs
00716             $Temp = $ItemIdA;
00717             $ItemIdA = $ItemIdB;
00718             $ItemIdB = $Temp;
00719         }
00720 
00721         # if new correlation value provided
00722         if ($NewCorrelation != -1)
00723         {
00724             # if new value is above threshold
00725             if ($NewCorrelation >= $this->ContentCorrelationThreshold)
00726             {
00727                 # insert new correlation value in DB
00728                 $this->DB->Query("INSERT INTO RecContentCorrelations "
00729                         ."(ItemIdA, ItemIdB, Correlation) "
00730                         ."VALUES (${ItemIdA}, ${ItemIdB}, ${NewCorrelation})");
00731 
00732                 # return correlation value is new value
00733                 $Correlation = $NewCorrelation;
00734             }
00735             # else
00736             else
00737             {
00738                 # return value is zero
00739                 $Correlation = 0;
00740             }
00741         }
00742         else
00743         {
00744             # retrieve correlation value from DB
00745             $Correlation = $this->DB->Query(
00746                     "SELECT Correlation FROM RecContentCorrelations "
00747                             ."WHERE ItemIdA = ${ItemIdA} AND ItemIdB = ${ItemIdB}",
00748                     "Correlation");
00749 
00750             # if no value found in DB
00751             if ($Correlation == FALSE)
00752             {
00753                 # return value is zero
00754                 $Correlation = 0;
00755             }
00756         }
00757 
00758         # return correlation value to caller
00759         return $Correlation;
00760     }
00761 
00762     function FilterOnSuppliedFunctions($Results)
00763     {
00764         # if filter functions have been set
00765         if (count($this->FilterFuncs) > 0)
00766         {
00767             # for each result
00768             foreach ($Results as $ResourceId => $Result)
00769             {
00770                 # for each filter function
00771                 foreach ($this->FilterFuncs as $FuncName)
00772                 {
00773                     # if filter function return TRUE for result resource
00774                     if ($FuncName($ResourceId))
00775                     {
00776                         # discard result
00777                         if ($this->DebugLevel > 2) {  print("REC:      filter callback rejected resource ${ResourceId}<br>\n");  }
00778                         unset($Results[$ResourceId]);
00779 
00780                         # bail out of filter func loop
00781                         continue 2;
00782                     }
00783                 }
00784             }
00785         }
00786 
00787         # return filtered list to caller
00788         return $Results;
00789     }
00790 }
00791 
00792 ?>

CWIS logo doxygen
Copyright 2010 Internet Scout