RSSClient.php
Go to the documentation of this file.
<?PHP

#
#   FILE:  Scout--RSSClient.php
#
#   METHODS PROVIDED:
#       RSSClient($ServerUrl, $CacheDB, $RefreshTime, $Encoding, $DebugLevel)
#           - constructor
#       ServerUrl($NewValue) / Encoding($NewValue)
#           - get/set feed URL and character encoding
#       AutodetectEncoding()
#           - pick encoding from XML declaration or cached HTTP charset
#       GetItems($NumberOfItems, $ChannelName)
#           - retrieve feed items
#       GetChannelTitle() / GetChannelLink() / GetChannelDescription()
#           - retrieve channel-level values
#       UsedCachedData()
#           - report whether last load came from the cache
#
#   AUTHOR:  Edward Almasy
#
#   Copyright 2005 Internet Scout Project
#   http://scout.wisc.edu
#

/**
* Client for loading and reading an RSS (or RDF) feed, optionally caching the
* retrieved XML in a database table named RSSClientCache.
*
* Parsing is delegated to an XMLParser class that is defined elsewhere in the
* project, and the cache database object is expected to provide Query(),
* FetchField(), FetchRow() and NumRowsSelected() (these are the only methods
* called on it here).
*/
class RSSClient {

    # ---- PUBLIC INTERFACE --------------------------------------------------

    /**
    * Object constructor.  Loads the feed XML (from cache or server) and
    * parses it.
    * @param $ServerUrl URL of the feed.
    * @param $CacheDB Database object used for caching, or NULL for no caching.
    * @param $RefreshTime Maximum age of a usable cache entry, in seconds.
    * @param $Encoding Character encoding handed to the XML parser.
    * @param $DebugLevel Debug output level (0 = none).  Levels above 3 are
    *       passed on to the XML parser, reduced by 3.
    */
    function __construct($ServerUrl, $CacheDB = NULL, $RefreshTime = 600, $Encoding = "UTF-8", $DebugLevel = 0)
    {
        # save settings
        $this->DebugLevel = $DebugLevel;
        $this->Encoding = $Encoding;
        $this->CacheDB = $CacheDB;
        $this->RefreshTime = $RefreshTime;

        # query server (or cache) for XML text
        $this->XmlText = $this->QueryServerWithCaching(
                $ServerUrl, $CacheDB, $RefreshTime);

        # create XML parser and parse text
        $this->RebuildParser();

        if ($this->DebugLevel)
        {
            print("RSSClient->RSSClient() returned ".strlen($this->XmlText)." characters from server query<br>\n");
        }
    }

    /**
    * PHP4-style constructor, retained so that existing code calling it by
    * name (and PHP versions that still treat it as the constructor) keep
    * working.  Simply forwards to __construct().
    */
    function RSSClient($ServerUrl, $CacheDB = NULL, $RefreshTime = 600, $Encoding = "UTF-8", $DebugLevel = 0)
    {
        $this->__construct($ServerUrl, $CacheDB, $RefreshTime, $Encoding, $DebugLevel);
    }

    /**
    * Get/set the feed URL.  Supplying a URL that differs from the current
    * one re-reads the feed from that URL and re-parses it.
    * @param $NewValue New feed URL (OPTIONAL).
    * @return Current feed URL.
    */
    function ServerUrl($NewValue = NULL)
    {
        # if new RSS server URL supplied
        if (($NewValue != NULL) && ($NewValue != $this->ServerUrl))
        {
            # save new value
            $this->ServerUrl = $NewValue;

            # re-read XML from server at new URL
            $this->XmlText = $this->QueryServerWithCaching(
                    $NewValue,
                    $this->CacheDB,
                    $this->RefreshTime);

            # create new XML parser (using the configured encoding, as the
            #       constructor and Encoding() do) and parse text
            $this->RebuildParser();
        }

        # return RSS server URL to caller
        return $this->ServerUrl;
    }

    /**
    * Get/set the character encoding.  Supplying an encoding that differs
    * from the current one re-reads the feed and re-parses it with the new
    * encoding.
    * @param $NewValue New encoding name (OPTIONAL).
    * @return Current encoding name.
    */
    function Encoding($NewValue = NULL)
    {
        # if new encoding supplied
        if (($NewValue != NULL) && ($NewValue != $this->Encoding))
        {
            # save new value
            $this->Encoding = $NewValue;

            # re-read XML from server
            $this->XmlText = $this->QueryServerWithCaching(
                    $this->ServerUrl,
                    $this->CacheDB,
                    $this->RefreshTime);

            # create new XML parser and parse text
            $this->RebuildParser();
        }

        # return encoding to caller
        return $this->Encoding;
    }

    /**
    * Choose an encoding for the loaded feed and apply it via Encoding().
    * Order of precedence:  encoding named in the XML declaration, then the
    * charset stored in the cache entry for the feed URL (when a cache DB is
    * in use), then ISO-8859-1.
    */
    function AutodetectEncoding()
    {
        # if neither the XML file nor the HTTP response headers specify an
        #       encoding, there is an overwhelming chance that it's
        #       ISO-8859-1, so use it as the default
        $Encoding = "ISO-8859-1";

        # only match up to the encoding portion of the XML declaration
        #       http://www.w3.org/TR/2006/REC-xml-20060816/#sec-prolog-dtd
        # (S is applied with +/* below because the spec allows runs of
        #       white space, not just a single character)
        $S = '[ \t\r\n]';
        $Eq = "{$S}*={$S}*";
        $VersionNum = '1\.[0-9]+';
        $EncName = '[A-Za-z]([A-Za-z0-9._]|-)*';
        $VersionInfo = "{$S}+version{$Eq}('{$VersionNum}'|\"{$VersionNum}\")";
        $EncodingDecl = "{$S}+encoding{$Eq}('{$EncName}'|\"{$EncName}\")";
        $XMLDecl = "<\?xml{$VersionInfo}({$EncodingDecl})?";
        $RegEx = "/{$XMLDecl}/";

        # try to find the encoding (group 3 is the quoted encoding name and
        #       is only present in the results if an encoding is declared)
        $Matches = array();
        preg_match($RegEx, (string)$this->XmlText, $Matches);

        # give precedence to the encoding specified within the XML file since
        #       a RSS feed publisher might not have access to HTTP response
        #       headers
        if (count($Matches) >= 4)
        {
            # strip off the surrounding quotes
            $Encoding = trim($Matches[3], "'\"");
        }
        # otherwise fall back to the charset parameter saved from the
        #       Content-Type response header when the feed was cached
        else if ($this->CacheDB)
        {
            $DB = $this->CacheDB;
            $ServerUrl = addslashes($this->ServerUrl);

            # look up the cache entry for this feed
            $DB->Query("
                SELECT * FROM RSSClientCache
                WHERE ServerUrl = '".$ServerUrl."'");
            $Exists = ($DB->NumRowsSelected() > 0);
            $Cache = $DB->FetchRow();

            # if cached and charset parameter was given in the response headers
            if ($Exists && strlen($Cache["Charset"]))
            {
                $Encoding = $Cache["Charset"];
            }
        }

        $this->Encoding($Encoding);
    }

    /**
    * Retrieve feed items.
    * @param $NumberOfItems Maximum number of items to return, or NULL for
    *       all items.  (Compared loosely against NULL, so 0 also means all.)
    * @param $ChannelName Accepted for interface compatibility;  not used by
    *       this implementation.
    * @return Array of items, each an associative array with "title",
    *       "description", "link" and "enclosure" entries.  Empty if no items
    *       were found.
    */
    function GetItems($NumberOfItems = NULL, $ChannelName = NULL)
    {
        # start by assuming no items will be found
        $Items = array();

        # move parser to area in XML with items
        $Parser = $this->Parser;
        $Parser->SeekToRoot();
        $Result = $Parser->SeekTo("rss");
        if ($Result === NULL)
        {
            # no "rss" element found, so look for an RDF feed instead
            $Result = $Parser->SeekTo("rdf:RDF");
        }
        else
        {
            $Parser->SeekTo("channel");
        }

        # if items are found
        $ItemCount = $Parser->SeekTo("item");
        if ($ItemCount)
        {
            # for each record
            $Index = 0;
            do
            {
                # retrieve item info
                $Items[$Index]["title"] = $Parser->GetData("title");
                $Items[$Index]["description"] = $Parser->GetData("description");
                $Items[$Index]["link"] = $Parser->GetData("link");
                $Items[$Index]["enclosure"] = $Parser->GetAttributes("enclosure");

                $Index++;
            }
            while ($Parser->NextItem() && (($NumberOfItems == NULL) || ($Index < $NumberOfItems)));
        }

        # return records to caller
        return $Items;
    }

    /**
    * Retrieve channel title as given in feed.
    * @return Value of the channel "title" element.
    */
    function GetChannelTitle()
    {
        if (!isset($this->ChannelTitle)) {  $this->LoadChannelInfo();  }
        return $this->ChannelTitle;
    }

    /**
    * Retrieve channel link as given in feed.
    * @return Value of the channel "link" element.
    */
    function GetChannelLink()
    {
        if (!isset($this->ChannelLink)) {  $this->LoadChannelInfo();  }
        return $this->ChannelLink;
    }

    /**
    * Retrieve channel description as given in feed.
    * @return Value of the channel "description" element.
    */
    function GetChannelDescription()
    {
        if (!isset($this->ChannelDescription)) {  $this->LoadChannelInfo();  }
        return $this->ChannelDescription;
    }

    /**
    * Tell caller whether the most recent load used cached data.
    * @return TRUE if the XML came from the cache, otherwise FALSE.
    */
    function UsedCachedData()
    {
        return $this->CachedDataWasUsed;
    }


    # ---- PRIVATE INTERFACE -------------------------------------------------

    var $CacheDB;
    var $RefreshTime;
    var $ServerUrl;
    var $MetadataPrefix;    # not referenced anywhere in this class
    var $SetSpec;           # not referenced anywhere in this class
    var $DebugLevel;
    var $Encoding;
    var $XmlText;
    var $Parser;
    var $ChannelTitle;
    var $ChannelLink;
    var $ChannelDescription;
    var $CachedDataWasUsed;

    /**
    * Set current debug output level (0-9).
    * @param $NewLevel New debug level.
    */
    function SetDebugLevel($NewLevel)
    {
        $this->DebugLevel = $NewLevel;
    }

    /**
    * (Re)create the XML parser for the current encoding, pass on the debug
    * level, and parse the currently loaded XML text.  Also discards any
    * previously loaded channel info so that it is re-read from the new parse.
    */
    function RebuildParser()
    {
        $this->Parser = new XMLParser($this->Encoding);
        if ($this->DebugLevel > 3)
        {
            $this->Parser->SetDebugLevel($this->DebugLevel - 3);
        }
        $this->Parser->ParseText($this->XmlText);

        # channel values loaded from an earlier parse are no longer valid
        $this->ChannelTitle = NULL;
        $this->ChannelLink = NULL;
        $this->ChannelDescription = NULL;
    }

    /**
    * Fetch the content at a URL along with the media type and charset given
    * in a Content-Type response header.  Type and charset are only reported
    * when the header includes a charset parameter.
    * @param $Url URL (or path) to fetch.
    * @return Array of (text, type, charset).  Text is FALSE if the fetch
    *       failed;  type and charset are NULL when not available.
    */
    function GetXmlInfo($Url)
    {
        # (errors are suppressed because failure is reported via the FALSE
        #       return value)
        $Text = @file_get_contents($Url);
        $Type = NULL;
        $Charset = NULL;

        # get the type and charset if the fetch was successful
        if ($Text !== FALSE)
        {
            # this must come after file_get_contents() and before any other
            #       remote fetching is done (the variable is only populated
            #       for HTTP fetches, so it may not exist at all)
            $Headers = (isset($http_response_header)
                    && is_array($http_response_header))
                    ? $http_response_header : array();

            # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
            $LWS = '([ \t]*|\r\n[ \t]+)';
            $Token = '[!\x23-\x27*+-.\x30-\x39\x41-\x5A\x5E-\x7A|~]+';
            # (a literal backslash followed by any ASCII character)
            $QuotedPair = '\\\\[\x00-\x7F]';
            $QdText = "([^\\x00-\\x1F\\x7F\"]|{$LWS})";
            $QuotedString = "\"({$QdText}|{$QuotedPair})*\"";
            $Value = "({$Token}|{$QuotedString})";
            $Parameter = "{$Token}{$LWS}={$LWS}{$Value}";

            # these make the Content-Type regex specific to Content-Type
            #       values with charset parameters in them, but make capturing
            #       the charset much easier
            $BasicParameter = "(;{$LWS}{$Parameter})*";
            $CharsetParameter = "(;{$LWS}charset{$LWS}={$LWS}{$Value})";
            $ModParameter = "{$BasicParameter}{$CharsetParameter}{$BasicParameter}";
            $MediaType = "({$Token}{$LWS}\\/{$LWS}{$Token}){$LWS}{$ModParameter}";

            # back to the spec
            $ContentType = "Content-Type{$LWS}:{$LWS}{$MediaType}{$LWS}";
            $RegEx = "/^{$ContentType}$/i";

            foreach ($Headers as $Header)
            {
                $Matches = array();
                preg_match($RegEx, $Header, $Matches);

                # group 3 is the media type and group 19 the charset value
                if (isset($Matches[3]) && isset($Matches[19]))
                {
                    $Type = $Matches[3];

                    # charset may be given as a quoted string
                    $Charset = trim($Matches[19], '"');
                    break;
                }
            }
        }

        return array($Text, $Type, $Charset);
    }

    /**
    * Load feed XML from the cache if an unexpired entry exists, otherwise
    * from the server (refreshing the cache entry when a cache DB is in use).
    * Also records the URL, the cache DB (if one is supplied) and whether
    * cached data was used.
    * @param $ServerUrl URL of the feed.
    * @param $CacheDB Database object used for caching, or NULL to use any
    *       previously saved cache DB (or none).
    * @param $RefreshTime Maximum age of a usable cache entry, in seconds.
    * @return XML text, or an empty string if the fetch failed.
    */
    function QueryServerWithCaching($ServerUrl, $CacheDB, $RefreshTime)
    {
        # save RSS server URL
        $this->ServerUrl = $ServerUrl;

        # save caching info (if any)
        if ($CacheDB)
        {
            $this->CacheDB = $CacheDB;
        }
        $DB = $this->CacheDB;

        # NOTE: values are escaped with addslashes() because the query
        #       interface visible here only accepts complete SQL strings

        # if caching is available
        if ($DB)
        {
            # look up cached information for this server
            $QueryTimeCutoff = date("Y-m-d H:i:s", (time() - $RefreshTime));
            $DB->Query("
                SELECT * FROM RSSClientCache
                WHERE ServerUrl = '".addslashes($ServerUrl)."'
                AND LastQueryTime > '".$QueryTimeCutoff."'");

            # if we have cached info that has not expired
            if ($CachedXml = $DB->FetchField("CachedXml"))
            {
                # use cached info
                $this->CachedDataWasUsed = TRUE;
                return $CachedXml;
            }
        }

        # no usable cached copy (or no cache at all), so query server
        $this->CachedDataWasUsed = FALSE;
        $QueryResult = "";
        list($Text, $Type, $Charset) = $this->GetXmlInfo($ServerUrl);

        # if query was successful
        if ($Text !== FALSE)
        {
            $QueryResult = $Text;

            if ($DB)
            {
                # clear out any old cache entries
                $DB->Query("
                    DELETE FROM RSSClientCache
                    WHERE ServerUrl = '".addslashes($ServerUrl)."'");

                # save info in cache
                $DB->Query("
                    INSERT INTO RSSClientCache
                    (ServerUrl, CachedXml, Type, Charset, LastQueryTime)
                    VALUES (
                        '".addslashes($ServerUrl)."',
                        '".addslashes($Text)."',
                        '".addslashes((string)$Type)."',
                        '".addslashes((string)$Charset)."',
                        NOW())");
            }
        }

        # return query result to caller
        return $QueryResult;
    }

    /**
    * Read the channel title, link and description from the parsed feed and
    * save them for the GetChannel*() methods.
    */
    function LoadChannelInfo()
    {
        $Parser = $this->Parser;
        $Parser->SeekToRoot();
        $Result = $Parser->SeekTo("rss");
        if ($Result === NULL)
        {
            # no "rss" element found, so look for an RDF feed instead
            $Result = $Parser->SeekTo("rdf:RDF");
        }
        $Parser->SeekTo("channel");
        $this->ChannelTitle = $Parser->GetData("title");
        $this->ChannelLink = $Parser->GetData("link");
        $this->ChannelDescription = $Parser->GetData("description");
    }
}