<?php

#
#   FILE:  Scout--OAIClient.php
#     Provides a client for pulling data from OAI-PMH providers
#     For protocol documentation, see:
#     http://www.openarchives.org/OAI/openarchivesprotocol.html
#
#   METHODS PROVIDED:
#       OAIClient(ServerUrl, Cache)
#           - constructor
#       ServerUrl(NewValue)
#           - Change the base url of the remote repository
#       MetadataPrefix($pfx)
#           - Set the schema we will request from remote
#       SetSpec($set)
#           - Restrict queries to a single set
#             for details, see
#             http://www.openarchives.org/OAI/openarchivesprotocol.html#Set
#       GetIdentification()
#           - Fetch identifying information about the remote repository
#       GetFormats()
#           - Fetch information about what schemas remote can serve
#       GetRecords($start,$end)
#           - Pull records in batches, optionally with date restrictions
#       GetRecord($id)
#           - Pull a single record using a unique identifier
#       MoreRecordsAvailable()
#           - Determine if a batch pull is complete or not
#       ResetRecordPointer()
#           - Restart a batch pull from the beginning
#       SetDebugLevel()
#           - Determine verbosity
#
#   Copyright 2008 Edward Almasy and Internet Scout
#   http://scout.wisc.edu
#

require_once("Scout--XMLParser.php");


class OAIClient {

    # ---- PUBLIC INTERFACE --------------------------------------------------

    /**
    * Class constructor.
    *
    * Declared as __construct() (and declared before the legacy same-name
    * method) because PHP 8 no longer treats a method named after the class
    * as a constructor.
    *
    * @param string $ServerUrl Base URL of the remote OAI-PMH repository.
    * @param string $Cache Optional directory used to cache ListRecords
    *       responses.  If the directory already exists, GetRecords() reads
    *       responses from it instead of querying the server; if it does not
    *       exist it is created and responses fetched from the server are
    *       written into it.
    */
    public function __construct($ServerUrl, $Cache=NULL)
    {
        # set default debug level
        $this->DebugLevel = 0;

        # save OAI server URL
        $this->ServerUrl = $ServerUrl;

        # set default metadata prefix
        $this->MetadataPrefix = "oai_dc";

        # set default set specification for queries
        $this->SetSpec = NULL;

        # cached responses are stored in numbered files, starting from zero
        $this->CacheSequenceNumber = 0;
        if ($Cache !== NULL)
        {
            $this->Cache = $Cache;

            # an already-existing directory means "replay from cache"
            $this->UsingCache = is_dir($Cache);
            if ($this->UsingCache == FALSE)
            {
                # otherwise create it so fetched responses can be saved
                mkdir($Cache);
            }
        }
    }

    /**
    * Legacy (PHP 4 style) constructor name, retained so that existing code
    * calling parent::OAIClient() keeps working.  Delegates to __construct().
    *
    * @param string $ServerUrl Base URL of the remote OAI-PMH repository.
    * @param string $Cache Optional cache directory (see __construct()).
    */
    public function OAIClient($ServerUrl, $Cache=NULL)
    {
        $this->__construct($ServerUrl, $Cache);
    }

    /**
    * Get or set the base URL of the remote repository.
    *
    * @param string $NewValue New base URL.  (OPTIONAL)  Values that compare
    *       loosely equal to NULL (e.g. an empty string) leave the current
    *       URL untouched.
    * @return string Current base URL.
    */
    public function ServerUrl($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->ServerUrl = $NewValue;
        }
        return $this->ServerUrl;
    }

    /**
    * Get or set the metadata prefix (schema) requested from the repository.
    *
    * @param string $NewValue New metadata prefix.  (OPTIONAL)  Values that
    *       compare loosely equal to NULL leave the current prefix untouched.
    * @return string Current metadata prefix ("oai_dc" by default).
    */
    public function MetadataPrefix($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->MetadataPrefix = $NewValue;
        }
        return $this->MetadataPrefix;
    }

    /**
    * Get or set the set specification used to restrict ListRecords queries.
    *
    * A sentinel default is used instead of NULL so that callers can pass
    * NULL explicitly to clear a previously configured set.
    *
    * @param string $NewValue New set specification, or NULL to clear it.
    *       (OPTIONAL)
    * @return string Current set specification, or NULL if none is set.
    */
    public function SetSpec($NewValue = "X-NOSETSPECVALUE-X")
    {
        if ($NewValue != "X-NOSETSPECVALUE-X")
        {
            $this->SetSpec = $NewValue;
        }
        return $this->SetSpec;
    }

    /**
    * Retrieve identifying information about the repository (OAI "Identify"
    * verb).
    *
    * @return array Associative array that may contain the keys "Name",
    *       "Email" and "URL" (taken from repositoryName, adminEmail and
    *       baseURL respectively).  Keys are present only when the
    *       corresponding element was found; the array is empty when the
    *       response could not be parsed.
    */
    public function GetIdentification()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("Identify");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = simplexml_load_string($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if identification info was found
        $Info = array();
        if (isset($Xml->Identify))
        {
            # extract info
            $Ident = $Xml->Identify;
            $this->GetValFromXml($Ident, "repositoryName", "Name", $Info);
            $this->GetValFromXml($Ident, "adminEmail", "Email", $Info);
            $this->GetValFromXml($Ident, "baseURL", "URL", $Info);
        }

        # return info to caller
        return $Info;
    }

    /**
    * Retrieve the metadata formats the repository can serve (OAI
    * "ListMetadataFormats" verb).
    *
    * @return array Numerically indexed list with one entry per format; each
    *       entry may contain the keys "Name", "Schema" and "Namespace"
    *       (taken from metadataPrefix, schema and metadataNamespace).
    */
    public function GetFormats()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("ListMetadataFormats");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = simplexml_load_string($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if format info was found
        $Formats = array();
        if (isset($Xml->ListMetadataFormats->metadataFormat))
        {
            # extract info
            $Index = 0;
            foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
            {
                $this->GetValFromXml(
                        $Format, "metadataPrefix", "Name", $Formats[$Index]);
                $this->GetValFromXml(
                        $Format, "schema", "Schema", $Formats[$Index]);
                $this->GetValFromXml(
                        $Format, "metadataNamespace", "Namespace",
                        $Formats[$Index]);
                $Index++;
            }
        }

        # return info to caller
        return $Formats;
    }

    /**
    * Retrieve the next batch of records (OAI "ListRecords" verb).
    *
    * The first call starts a new list request; while
    * MoreRecordsAvailable() returns TRUE, subsequent calls continue the
    * same list using the saved resumption token (date arguments are then
    * ignored).  When a cache directory that already existed was given to
    * the constructor, responses are read from numbered files in that
    * directory instead of being requested from the server.
    *
    * @param string $StartDate Only return records changed on or after this
    *       date ("from" argument).  (OPTIONAL)
    * @param string $EndDate Only return records changed on or before this
    *       date ("until" argument).  (OPTIONAL)
    * @return array List of records; each record has "identifier" and
    *       "datestamp" entries and, when present in the response,
    *       "format", "metadata" and "about" entries.
    */
    public function GetRecords($StartDate = NULL, $EndDate = NULL)
    {
        # work out which cache file corresponds to this request (if caching)
        if ($this->Cache != NULL)
        {
            $CacheFileName = sprintf("%s/%010x",
                    $this->Cache,
                    $this->CacheSequenceNumber);
            $this->CacheSequenceNumber++;
        }

        if ($this->Cache == NULL || $this->UsingCache == FALSE)
        {
            $Args = array();

            # if we have resumption token from prior query
            if (isset($this->ResumptionToken))
            {
                # use resumption token as sole argument
                $Args["resumptionToken"] = $this->ResumptionToken;
            }
            else
            {
                # set up arguments for query
                $Args["metadataPrefix"] = $this->MetadataPrefix;
                if ($StartDate) {  $Args["from"] = $StartDate;  }
                if ($EndDate)   {  $Args["until"] = $EndDate;  }
                if ($this->SetSpec) {  $Args["set"] = $this->SetSpec;  }
            }

            # query server for XML text
            $XmlText = $this->PerformQuery("ListRecords", $Args);

            # save response so that a later run can replay it from the cache
            if ($this->Cache != NULL)
            {
                file_put_contents($CacheFileName, $XmlText);
            }
        }
        else
        {
            # get XML text from the cache
            $XmlText = file_get_contents($CacheFileName);
        }

        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "listrecords");
    }

    /**
    * Retrieve a single record by identifier (OAI "GetRecord" verb).
    *
    * NOTE(review): the shared parsing routine only looks for a resumption
    * token under "listrecords", so calling this in the middle of a
    * GetRecords() batch appears to discard the saved resumption token --
    * confirm against XMLParser::SeekTo() before relying on interleaving.
    *
    * @param string $Id Unique identifier of the record to fetch.
    * @return array List of records in the same format as GetRecords()
    *       (normally containing a single record).
    */
    public function GetRecord($Id)
    {
        $Args = array();
        $Args["metadataPrefix"] = $this->MetadataPrefix;
        $Args["identifier"] = $Id;

        # query server for XML text
        $XmlText = $this->PerformQuery("GetRecord", $Args);
        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "getrecord");
    }

    /**
    * Check whether the current batch retrieval has more records to fetch.
    *
    * @return bool TRUE if a resumption token from the last response is
    *       being held, otherwise FALSE.
    */
    public function MoreRecordsAvailable()
    {
        return isset($this->ResumptionToken);
    }

    /**
    * Restart batch retrieval from the beginning: forget any resumption
    * token and rewind the cache file sequence.
    */
    public function ResetRecordPointer()
    {
        $this->ResumptionToken = NULL;
        $this->CacheSequenceNumber = 0;
    }

    /**
    * Set the verbosity of debugging output.
    *
    * @param int $NewLevel New debug level.  Raw response text is printed at
    *       level 8 and above, parsed structures at level 9 and above.
    */
    public function SetDebugLevel($NewLevel)
    {
        $this->DebugLevel = $NewLevel;
    }


    # ---- PRIVATE INTERFACE -------------------------------------------------

    private $ServerUrl;
    private $MetadataPrefix;
    private $SetSpec;
    private $DebugLevel;
    private $ResumptionToken;
    private $Cache;
    private $UsingCache;
    private $CacheSequenceNumber;

    /**
    * Perform OAI query and return resulting data to caller.
    *
    * @param string $QueryVerb OAI-PMH verb to send.
    * @param array $Args Query arguments (name => value).  (OPTIONAL)
    * @return string Response body, or an empty string if the server could
    *       not be reached or the response could not be read.
    */
    private function PerformQuery($QueryVerb, $Args = NULL)
    {
        # build query URL
        $QueryUrl = $this->ServerUrl."?verb=".$QueryVerb;
        if ($Args)
        {
            foreach ($Args as $ArgName => $ArgValue)
            {
                $QueryUrl .= "&".urlencode($ArgName)."=".urlencode($ArgValue);
            }
        }

        # open stream to OAI server
        $FHndl = fopen($QueryUrl, "r");

        # if stream was successfully opened
        $Text = "";
        if ($FHndl !== FALSE)
        {
            # read entire response from server
            $Data = stream_get_contents($FHndl);
            if ($Data !== FALSE)
            {
                $Text = $Data;
            }

            # close OAI server stream (only valid when the open succeeded)
            fclose($FHndl);
        }

        # return query result data to caller
        return $Text;
    }

    /**
    * Set array value if available in simplexml object.
    *
    * @param object $Xml SimpleXML element to read from.
    * @param string $SrcName Name of the child element to look for.
    * @param string $DstName Key under which to store the trimmed value.
    * @param array $Results Destination array (passed by reference).
    */
    private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
    {
        if (isset($Xml->$SrcName))
        {
            $Results[$DstName] = trim((string)$Xml->$SrcName);
        }
    }

    /**
    * Print variable contents if debug level is at or above specified level.
    *
    * @param int $Level Minimum debug level at which to print.
    * @param string $MethodName Name of the calling method.
    * @param string $VarName Name of the variable being dumped.
    * @param mixed $VarValue Value to dump.
    */
    private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
    {
        if ($this->DebugLevel >= $Level)
        {
            print("\n<pre>".$MethodName."()  ".$VarName." = \n");
            print_r($VarValue);
            print("</pre>\n");
        }
    }

    /**
    * Collect the data of the tag the parser is currently positioned on and
    * of every following tag reported by NextTag(), grouped by tag name.
    *
    * @param object $Parser XMLParser positioned on the first tag to collect.
    * @return array Tag name => list of data values.
    */
    private function CollectTagValues($Parser)
    {
        $Values = array();
        $TagName = $Parser->GetTagName();
        do
        {
            $Values[$TagName][] = $Parser->GetData();
        } while ($TagName = $Parser->NextTag());
        return $Values;
    }

    /**
    * Extract records (and any resumption token) from an OAI response.
    *
    * @param string $XmlText Raw XML text of the response.
    * @param string $ParseTo Lower-case name of the verb element that
    *       contains the records ("listrecords" or "getrecord").
    * @return array List of records (see GetRecords() for the format).
    */
    private function GetRecordsFromXML($XmlText, $ParseTo)
    {
        # create XML parser and pass it text
        $Parser = new XMLParser();
        $Parser->ParseText($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Parser", $Parser);

        # if records were found
        $Records = array();
        $ItemCount = $Parser->SeekTo("oai-pmh", $ParseTo, "record");
        if ($ItemCount)
        {
            # for each record
            $Index = 0;
            do
            {
                # grab record identifier and date
                $Records[$Index]["identifier"] =
                        $Parser->GetData("header", "identifier");
                $Records[$Index]["datestamp"] =
                        $Parser->GetData("header", "datestamp");

                # grab metadata
                $SeekResult = $Parser->SeekTo("metadata");
                if ($SeekResult)
                {
                    $SeekResult = $Parser->SeekToChild();
                    if ($SeekResult)
                    {
                        # the child of <metadata> names the record format
                        $Records[$Index]["format"] = $Parser->GetTagName();
                        $SeekResult = $Parser->SeekToChild();
                        if ($SeekResult)
                        {
                            $Records[$Index]["metadata"] =
                                    $this->CollectTagValues($Parser);
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                # grab search info (if any)
                $SeekResult = $Parser->SeekTo("about");
                if ($SeekResult)
                {
                    $SeekResult = $Parser->SeekTo("searchInfo");
                    if ($SeekResult)
                    {
                        $SeekResult = $Parser->SeekToChild();
                        if ($SeekResult)
                        {
                            $Records[$Index]["about"]["SEARCHINFO"] =
                                    $this->CollectTagValues($Parser);
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                $Index++;
            }
            while ($Parser->NextItem());
        }

        # look for resumption token
        # (presumably SeekTo() returns NULL when the path is not found --
        #       verify against XMLParser)
        $Parser->SeekToRoot();
        $SeekResult = $Parser->SeekTo(
                "oai-pmh", "listrecords", "resumptiontoken");
        $Token = ($SeekResult !== NULL) ? $Parser->GetData() : NULL;

        # OAI-PMH marks the end of a list with an EMPTY resumptionToken
        #       element, so an empty token means there is nothing more to
        #       fetch and must not be saved
        if (is_string($Token) && (trim($Token) === ""))
        {
            $Token = NULL;
        }

        # save token if found (NULL makes MoreRecordsAvailable() report FALSE)
        $this->ResumptionToken = $Token;

        # return records to caller
        return $Records;
    }

}