OAIClient.php
Go to the documentation of this file.
<?PHP

#
#   FILE:  Scout--OAIClient.php
#     Provides a client for pulling data from OAI-PMH providers
#     For protocol documentation, see:
#     http://www.openarchives.org/OAI/openarchivesprotocol.html
#
#   METHODS PROVIDED:
#       OAIClient(ServerUrl, Cache)
#           - constructor
#       ServerUrl(NewValue)
#           - Change the base url of the remote repository
#       MetadataPrefix($pfx)
#           - Set the schema we will request from remote
#       SetSpec($set)
#           - Restrict queries to a single set
#             for details, see
#             http://www.openarchives.org/OAI/openarchivesprotocol.html#Set
#       GetIdentification()
#           - Fetch identifying information about the remote repository
#       GetFormats()
#           - Fetch information about what schemas remote can serve
#       GetRecords($start,$end)
#           - Pull records in batches, optionally with date restrictions
#       GetRecord($id)
#           - Pull a single record using a unique identifier
#       MoreRecordsAvailable()
#           - Determine if a batch pull is complete or not
#       ResetRecordPointer()
#           - Restart a batch pull from the beginning
#       SetDebugLevel()
#           - Determine verbosity
#
#   Copyright 2008 Edward Almasy and Internet Scout
#   http://scout.wisc.edu
#

require_once("XMLParser.php");


class OAIClient {

    # ---- PUBLIC INTERFACE --------------------------------------------------

    /**
    * Class constructor.
    * @param ServerUrl Base URL of the remote OAI-PMH repository.
    * @param Cache Optional cache directory.  If the directory already
    *       exists, GetRecords() reads responses from it instead of querying
    *       the server; if it does not exist, it is created and GetRecords()
    *       stores each server response in it.
    */
    function __construct($ServerUrl, $Cache=NULL)
    {
        # set default debug level
        $this->DebugLevel = 0;

        # save OAI server URL
        $this->ServerUrl = $ServerUrl;

        # set default metadata prefix
        $this->MetadataPrefix = "oai_dc";

        # set default set specification for queries
        $this->SetSpec = NULL;

        $this->CacheSequenceNumber = 0;
        if ($Cache !== NULL)
        {
            $this->Cache = $Cache;

            # an existing directory means we replay from it, a missing one
            #       means we create it and fill it from the server
            $this->UsingCache = is_dir($Cache);
            if ($this->UsingCache == FALSE)
            {
                mkdir($Cache);
            }
        }
    }

    /**
    * Legacy (PHP 4 style) constructor name, retained for callers that invoke
    * it explicitly.  PHP 8 no longer treats a method named after its class
    * as a constructor, so the real initialization lives in __construct().
    * @param ServerUrl Base URL of the remote OAI-PMH repository.
    * @param Cache Optional cache directory (see __construct()).
    */
    function OAIClient($ServerUrl, $Cache=NULL)
    {
        $this->__construct($ServerUrl, $Cache);
    }

    /**
    * Get or set the base URL of the remote repository.
    * @param NewValue New URL (optional; ignored when it compares equal
    *       to NULL).
    * @return Current server URL.
    */
    function ServerUrl($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->ServerUrl = $NewValue;
        }
        return $this->ServerUrl;
    }

    /**
    * Get or set the metadata prefix (schema) requested from the server.
    * @param NewValue New metadata prefix (optional; ignored when it compares
    *       equal to NULL).
    * @return Current metadata prefix.
    */
    function MetadataPrefix($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->MetadataPrefix = $NewValue;
        }
        return $this->MetadataPrefix;
    }

    /**
    * Get or set the set specification used to restrict record queries.
    * Passing NULL clears the restriction.
    * @param NewValue New set specification (optional).
    * @return Current set specification.
    */
    function SetSpec($NewValue = "X-NOSETSPECVALUE-X")
    {
        # a sentinel default is used so that NULL can be passed in to clear
        #       the value (strict comparison avoids numeric/string juggling)
        if ($NewValue !== "X-NOSETSPECVALUE-X")
        {
            $this->SetSpec = $NewValue;
        }
        return $this->SetSpec;
    }

    /**
    * Retrieve identification information from the server ("Identify" verb).
    * @return Array with whichever of the keys "Name", "Email", and "URL"
    *       were present in the response (empty array if none were found).
    */
    function GetIdentification()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("Identify");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = simplexml_load_string($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if identification info was found
        $Info = array();
        if (isset($Xml->Identify))
        {
            # extract info
            $Ident = $Xml->Identify;
            $this->GetValFromXml($Ident, "repositoryName", "Name", $Info);
            $this->GetValFromXml($Ident, "adminEmail", "Email", $Info);
            $this->GetValFromXml($Ident, "baseURL", "URL", $Info);
        }

        # return info to caller
        return $Info;
    }

    /**
    * Retrieve the metadata formats the server reports
    * ("ListMetadataFormats" verb).
    * @return Array (one entry per format) of arrays with whichever of the
    *       keys "Name", "Schema", and "Namespace" were present.
    */
    function GetFormats()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("ListMetadataFormats");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = simplexml_load_string($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if format info was found
        $Formats = array();
        if (isset($Xml->ListMetadataFormats->metadataFormat))
        {
            # extract info
            $Index = 0;
            foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
            {
                # start from an array so an entry with no recognized
                #       children is still an (empty) array rather than NULL
                $Formats[$Index] = array();
                $this->GetValFromXml(
                        $Format, "metadataPrefix", "Name", $Formats[$Index]);
                $this->GetValFromXml(
                        $Format, "schema", "Schema", $Formats[$Index]);
                $this->GetValFromXml(
                        $Format, "metadataNamespace", "Namespace",
                        $Formats[$Index]);
                $Index++;
            }
        }

        # return info to caller
        return $Formats;
    }

    /**
    * Retrieve the next batch of records ("ListRecords" verb).  If a
    * resumption token was saved by the previous call it is sent as the sole
    * argument; otherwise the metadata prefix, date range, and set
    * specification are sent.  When a cache directory is configured, the
    * response is either read from or written to a sequentially-numbered
    * file in that directory.
    * @param StartDate Value for the "from" argument (optional).
    * @param EndDate Value for the "until" argument (optional).
    * @return Array of records (see GetRecordsFromXML()).
    */
    function GetRecords($StartDate = NULL, $EndDate = NULL)
    {
        if ($this->Cache != NULL)
        {
            $cache_fname = sprintf("%s/%010x",
                    $this->Cache,
                    $this->CacheSequenceNumber);
            $this->CacheSequenceNumber++;
        }

        if ($this->Cache == NULL or $this->UsingCache == FALSE)
        {
            $Args = array();

            # if we have resumption token from prior query
            if (isset($this->ResumptionToken))
            {
                # use resumption token as sole argument
                $Args["resumptionToken"] = $this->ResumptionToken;
            }
            else
            {
                # set up arguments for query
                $Args["metadataPrefix"] = $this->MetadataPrefix;
                if ($StartDate) {  $Args["from"] = $StartDate;  }
                if ($EndDate) {  $Args["until"] = $EndDate;  }
                if ($this->SetSpec) {  $Args["set"] = $this->SetSpec;  }
            }

            # query server for XML text
            $XmlText = $this->PerformQuery("ListRecords", $Args);

            if ($this->Cache != NULL)
            {
                file_put_contents($cache_fname, $XmlText);
            }
        }
        else
        {
            # Get XML text from the cache
            $XmlText = file_get_contents($cache_fname);

            # treat an unreadable cache file like a failed query (empty text)
            if ($XmlText === FALSE)
            {
                $XmlText = "";
            }
        }

        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "listrecords");
    }

    /**
    * Retrieve a single record ("GetRecord" verb) using the current
    * metadata prefix.
    * @param Id Identifier of the record to retrieve.
    * @return Array of records (see GetRecordsFromXML()).
    */
    function GetRecord($Id)
    {
        $Args["metadataPrefix"] = $this->MetadataPrefix;
        $Args["identifier"] = $Id;

        # query server for XML text
        $XmlText = $this->PerformQuery("GetRecord", $Args);
        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "getrecord");
    }

    /**
    * Report whether a resumption token was saved by the last record query.
    * @return TRUE if a resumption token is currently held, otherwise FALSE.
    */
    function MoreRecordsAvailable()
    {
        return isset($this->ResumptionToken) ? TRUE : FALSE;
    }

    /**
    * Discard any saved resumption token and rewind the cache sequence
    * number, so that the next GetRecords() call starts from the beginning.
    */
    function ResetRecordPointer()
    {
        unset($this->ResumptionToken);
        $this->CacheSequenceNumber = 0;
    }

    /**
    * Set the debug output level.  (Levels 8 and 9 are the ones used by
    * this class.)
    * @param NewLevel New debug level.
    */
    function SetDebugLevel($NewLevel)
    {
        $this->DebugLevel = $NewLevel;
    }


    # ---- PRIVATE INTERFACE -------------------------------------------------

    private $ServerUrl;
    private $MetadataPrefix;
    private $SetSpec;
    private $DebugLevel;
    private $ResumptionToken;
    private $Cache;
    private $UsingCache;
    private $CacheSequenceNumber;

    # perform OAI query and return resulting data to caller
    # (returns an empty string if the server URL could not be opened)
    private function PerformQuery($QueryVerb, $Args = NULL)
    {
        # build query URL, appending to any existing query string
        $Separator = (strpos($this->ServerUrl, "?") === FALSE) ? "?" : "&";
        $QueryUrl = $this->ServerUrl.$Separator."verb=".$QueryVerb;

        if ($Args)
        {
            foreach ($Args as $ArgName => $ArgValue)
            {
                $QueryUrl .= "&".urlencode($ArgName)."=".urlencode($ArgValue);
            }
        }

        # open stream to OAI server
        $FHndl = fopen($QueryUrl, "r");

        # if stream was successfully opened
        $Text = "";
        if ($FHndl !== FALSE)
        {
            # while data left in response
            while (!feof($FHndl))
            {
                # read chunk from server and add it to text to be parsed
                $Chunk = fread($FHndl, 10000000);

                # bail out on read error rather than looping indefinitely
                if ($Chunk === FALSE) {  break;  }
                $Text .= $Chunk;
            }

            # close OAI server stream (only valid if the open succeeded)
            fclose($FHndl);
        }

        # return query result data to caller
        return $Text;
    }

    # set array value if available in simplexml object
    private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
    {
        if (isset($Xml->$SrcName))
        {
            $Results[$DstName] = trim((string)$Xml->$SrcName);
        }
    }

    # print variable contents if debug is above specified level
    private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
    {
        if ($this->DebugLevel >= $Level)
        {
            print("\n<pre>".$MethodName."()  ".$VarName." = \n");
            print_r($VarValue);
            print("</pre>\n");
        }
    }

    # Recursively dump tags inside a metadata section, flattening them
    # as we go.  Nested tag names are joined with "/" to form the key
    # under which the leaf data is stored in $Records[$Index]["metadata"].
    private function DumpTagsRecursive(&$Records, $Index, $Parser, $ParentTagName=NULL)
    {
        $TagName = $Parser->GetTagName();
        do
        {
            $StorageTagName = ($ParentTagName !== NULL) ?
                    $ParentTagName."/".$TagName : $TagName;

            if ($Parser->SeekToChild())
            {
                # tag has children, so descend and then restore position
                $this->DumpTagsRecursive(
                        $Records, $Index, $Parser, $StorageTagName);
                $Parser->SeekToParent();
            }
            else
            {
                $Records[$Index]["metadata"][$StorageTagName][] =
                        $Parser->GetData();
            }
        } while ($TagName = $Parser->NextTag());
    }

    # Query has been sent, we need to retrieve records that came from it.
    # $ParseTo is the name of the response element that contains the records
    # ("listrecords" or "getrecord").  Returns an array of records, each with
    # "identifier" and "datestamp" entries plus "format", "metadata", and
    # "about" entries when the corresponding elements were found.  Also
    # saves or clears the resumption token for the next GetRecords() call.
    private function GetRecordsFromXML($XmlText, $ParseTo)
    {
        # create XML parser and pass it text
        $Parser = new XMLParser();
        $Parser->ParseText($XmlText);

        $this->DebugOutVar(9, __METHOD__, "Parser", $Parser);

        # if records were found
        $Records = array();
        $ItemCount = $Parser->SeekTo("oai-pmh", $ParseTo, "record");
        if ($ItemCount)
        {
            # for each record
            $Index = 0;
            do
            {
                # grab record identifier and date
                $Records[$Index]["identifier"] =
                        $Parser->GetData("header", "identifier");
                $Records[$Index]["datestamp"] =
                        $Parser->GetData("header", "datestamp");

                # grab metadata
                $SeekResult = $Parser->SeekTo("metadata");
                if ($SeekResult)
                {
                    $SeekResult = $Parser->SeekToChild();
                    if ($SeekResult)
                    {
                        # first child of the metadata element names the format
                        $Records[$Index]["format"] = $Parser->GetTagName();
                        $SeekResult = $Parser->SeekToChild();
                        if ($SeekResult)
                        {
                            $this->DumpTagsRecursive($Records, $Index, $Parser);
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                # grab search info (if any)
                $SeekResult = $Parser->SeekTo("about");
                if ($SeekResult)
                {
                    $SeekResult = $Parser->SeekTo("searchInfo");
                    if ($SeekResult)
                    {
                        $SeekResult = $Parser->SeekToChild();
                        if ($SeekResult)
                        {
                            $TagName = $Parser->GetTagName();
                            do
                            {
                                $Records[$Index]["about"]["SEARCHINFO"][$TagName][] =
                                        $Parser->GetData();
                            } while ($TagName = $Parser->NextTag());
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                $Index++;
            }
            while ($Parser->NextItem());
        }

        # look for resumption token and save if found
        $Parser->SeekToRoot();
        $SeekResult = $Parser->SeekTo(
                "oai-pmh", "listrecords", "resumptiontoken");
        $Token = NULL;
        if ($SeekResult !== NULL)
        {
            $Token = $Parser->GetData();
        }

        # (OAI-PMH marks the end of a list with an *empty* resumptionToken
        #       element, so only a non-empty token means more records)
        if (is_string($Token) && strlen(trim($Token)))
        {
            $this->ResumptionToken = trim($Token);
        }
        else
        {
            unset($this->ResumptionToken);
        }

        # return records to caller
        return $Records;
    }

}

?>