CWIS Developer Documentation
OAIClient.php
Go to the documentation of this file.
1 <?PHP
2 #
3 # FILE: OAIClient.php
4 # Provides a client for pulling data from OAI-PMH providers
5 # For protocol documentation, see:
6 # http://www.openarchives.org/OAI/openarchivesprotocol.html
7 #
8 # METHODS PROVIDED:
9 # OAIClient(ServerUrl, Cache)
10 # - constructor
11 # ServerUrl(NewValue)
12 # - Change the base url of the remote repository
13 # MetadataPrefix($pfx)
14 # - Set the schema we will request from remote
15 # SetSpec($set)
16 # - Restrict queries to a single set
17 # for details, see
18 # http://www.openarchives.org/OAI/openarchivesprotocol.html#Set
19 # GetIdentification()
20 # - Fetch identifying information about the remote repository
21 # GetFormats()
22 # - Fetch information about what schemas remote can serve
23 # GetRecords($start,$end)
24 # - Pull records in batches, optionally with date restrictions
25 # GetRecord($id)
26 # - Pull a single record using a unique identifier
27 # MoreRecordsAvailable()
28 # - Determine if a batch pull is complete or not
29 # ResetRecordPointer()
30 # - Restart a batch pull from the beginning
31 # SetDebugLevel()
32 # - Determine verbosity
33 #
34 # Copyright 2014 Edward Almasy and Internet Scout
35 # http://scout.wisc.edu
36 #
37 
class OAIClient {

    # ---- PUBLIC INTERFACE --------------------------------------------------

    /**
     * Class constructor.
     *
     * Declared as __construct() so that it runs on "new OAIClient(...)" in
     * every PHP version (same-name constructors are ignored by PHP 8).
     *
     * @param string $ServerUrl URL of target OAI repository server.
     * @param string $Cache Directory in which raw ListRecords responses are
     *      cached, or NULL for no caching.  The directory (including any
     *      missing parents) is created when it does not exist.
     *      (OPTIONAL, default NULL)
     */
    public function __construct($ServerUrl, $Cache=NULL)
    {
        # set default debug level
        $this->DebugLevel = 0;

        # save OAI server URL
        $this->ServerUrl = $ServerUrl;

        # set default metadata prefix
        $this->MetadataPrefix = "oai_dc";

        # set default set specification for queries (no set restriction)
        $this->SetSpec = NULL;

        # start with no batch pull in progress
        $this->ResumptionToken = NULL;
        $this->CacheSequenceNumber = 0;

        # set up cache directory if caching was requested
        $this->Cache = NULL;
        if ($Cache !== NULL)
        {
            $this->Cache = $Cache;
            if (!is_dir($Cache))
            {
                # recursive so that a nested cache path also works
                mkdir($Cache, 0777, TRUE);
            }
        }
    }

    /**
     * Legacy (PHP 4 style) constructor name, retained as an ordinary method
     * so that existing code calling it explicitly (for example
     * parent::OAIClient(...) in a subclass) keeps working.
     *
     * @param string $ServerUrl URL of target OAI repository server.
     * @param string $Cache Cache directory or NULL.  (OPTIONAL, default NULL)
     */
    public function OAIClient($ServerUrl, $Cache=NULL)
    {
        $this->__construct($ServerUrl, $Cache);
    }

    /**
     * Get or set URL of target OAI repository server.
     *
     * @param string $NewValue New URL of target OAI repository server.
     *      Values that compare loosely equal to NULL (e.g. "") leave the
     *      current URL unchanged.  (OPTIONAL)
     * @return string Current URL of target OAI repository server.
     */
    public function ServerUrl($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->ServerUrl = $NewValue;
        }
        return $this->ServerUrl;
    }

    /**
     * Get or set metadata schema for records being retrieved.
     *
     * @param string $NewValue New metadata prefix.  Values that compare
     *      loosely equal to NULL leave the current prefix unchanged.
     *      (OPTIONAL)
     * @return string Current metadata prefix.
     */
    public function MetadataPrefix($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->MetadataPrefix = $NewValue;
        }
        return $this->MetadataPrefix;
    }

    /**
     * Get or set specification of subset of records to be retrieved.
     *
     * A sentinel default is used (rather than NULL) so that callers can
     * pass NULL to clear a previously-set specification.
     *
     * @param string $NewValue New set specification, or NULL to remove any
     *      set restriction.  (OPTIONAL)
     * @return string Current set specification (NULL if none).
     */
    public function SetSpec($NewValue = "X-NOSETSPECVALUE-X")
    {
        # strict comparison so that no legitimate value can be mistaken
        # for the "no argument supplied" sentinel
        if ($NewValue !== "X-NOSETSPECVALUE-X")
        {
            $this->SetSpec = $NewValue;
        }
        return $this->SetSpec;
    }

    /**
     * Retrieve identification information from repository server.
     *
     * @return array Identification info, with any of the keys "Name",
     *      "Email", and "URL" that the server supplied.  Empty if the
     *      server could not be reached or returned no Identify element.
     */
    public function GetIdentification()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("Identify");
        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = $this->ParseXml($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if identification info was found
        $Info = array();
        if (($Xml !== NULL) && isset($Xml->Identify))
        {
            # extract info
            $Ident = $Xml->Identify;
            $this->GetValFromXml($Ident, "repositoryName", "Name", $Info);
            $this->GetValFromXml($Ident, "adminEmail", "Email", $Info);
            $this->GetValFromXml($Ident, "baseURL", "URL", $Info);
        }

        # return info to caller
        return $Info;
    }

    /**
     * Retrieve list of available metadata formats from repository server.
     *
     * @return array List (indexed from 0) of arrays, each with any of the
     *      keys "Name", "Schema", and "Namespace" that the server supplied.
     *      Empty if no format information was found.
     */
    public function GetFormats()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("ListMetadataFormats");
        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = $this->ParseXml($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if format info was found
        $Formats = array();
        if (($Xml !== NULL)
                && isset($Xml->ListMetadataFormats->metadataFormat))
        {
            # extract info for each format
            foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
            {
                # build each entry in its own array so that a format with
                # no recognized children yields array() rather than NULL
                $FormatInfo = array();
                $this->GetValFromXml(
                        $Format, "metadataPrefix", "Name", $FormatInfo);
                $this->GetValFromXml(
                        $Format, "schema", "Schema", $FormatInfo);
                $this->GetValFromXml(
                        $Format, "metadataNamespace", "Namespace", $FormatInfo);
                $Formats[] = $FormatInfo;
            }
        }

        # return info to caller
        return $Formats;
    }

    /**
     * Retrieve records from repository server.
     *
     * Records are pulled one batch per call.  After each call,
     * MoreRecordsAvailable() reports whether the server supplied a
     * resumption token; if so, the next call continues the same pull (and
     * the date arguments are ignored).  When a cache directory is in use,
     * each batch is stored in (and, if already present, read from) a file
     * named after its sequence number within the pull.
     *
     * @param string $StartDate Earliest date for records, passed to the
     *      server as the "from" argument.  (OPTIONAL)
     * @param string $EndDate Latest date for records, passed to the server
     *      as the "until" argument.  (OPTIONAL)
     * @return array List of records, each an array with "identifier" and
     *      "datestamp" entries plus, when available, "format", "metadata",
     *      and "about" entries, or NULL if the response could not be parsed.
     */
    public function GetRecords($StartDate = NULL, $EndDate = NULL)
    {
        # if we're using a cache directory, figure out which file
        # should contain this set of records
        $CacheFileName = NULL;
        if ($this->Cache !== NULL)
        {
            $CacheFileName = sprintf("%s/%010x",
                    $this->Cache,
                    $this->CacheSequenceNumber);
            $this->CacheSequenceNumber++;
        }

        if (($CacheFileName !== NULL) && file_exists($CacheFileName))
        {
            # get XML text from the cache
            $XmlText = file_get_contents($CacheFileName);
            if ($XmlText === FALSE)
            {
                $XmlText = "";
            }
        }
        else
        {
            # no cache or no cached copy, so query the OAI provider
            $Args = array();
            if ($this->ResumptionToken !== NULL)
            {
                # use resumption token from prior query as sole argument
                $Args["resumptionToken"] = $this->ResumptionToken;
            }
            else
            {
                # set up arguments for query
                $Args["metadataPrefix"] = $this->MetadataPrefix;
                if ($StartDate) {  $Args["from"] = $StartDate;  }
                if ($EndDate) {  $Args["until"] = $EndDate;  }
                if ($this->SetSpec) {  $Args["set"] = $this->SetSpec;  }
            }

            # query server for XML text
            $XmlText = $this->PerformQuery("ListRecords", $Args);

            # if a cache is in use, save this chunk of XML into it (but
            # never cache an empty response, because a failed query would
            # otherwise be replayed from the cache on every later run)
            if (($CacheFileName !== NULL) && (strlen($XmlText) > 0))
            {
                file_put_contents($CacheFileName, $XmlText);
            }
        }

        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "ListRecords");
    }

    /**
     * Get a single record from a repository server.
     *
     * This does not disturb any batch pull in progress via GetRecords().
     *
     * @param string $Id Unique identifier of the record to retrieve.
     * @return array List of records (in the same format returned by
     *      GetRecords()) or NULL if the response could not be parsed.
     */
    public function GetRecord($Id)
    {
        $Args = array(
                "metadataPrefix" => $this->MetadataPrefix,
                "identifier" => $Id,
                );

        # query server for XML text
        $XmlText = $this->PerformQuery("GetRecord", $Args);
        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "GetRecord");
    }

    /**
     * Check whether more records are available after last GetRecords().
     *
     * @return bool TRUE if the last batch came with a resumption token,
     *      otherwise FALSE.
     */
    public function MoreRecordsAvailable()
    {
        return ($this->ResumptionToken !== NULL) ? TRUE : FALSE;
    }

    /**
     * Clear any additional records available after last GetRecords(), so
     * that the next GetRecords() call starts a new pull from the beginning.
     */
    public function ResetRecordPointer()
    {
        $this->ResumptionToken = NULL;
        $this->CacheSequenceNumber = 0;
    }

    /**
     * Set current debug output level.
     *
     * @param int $NewLevel New level.  XML text is printed at level 8 and
     *      above, parsed XML objects at level 9 and above.
     */
    public function SetDebugLevel($NewLevel)
    {
        $this->DebugLevel = $NewLevel;
    }


    # ---- PRIVATE INTERFACE -------------------------------------------------

    private $ServerUrl;
    private $MetadataPrefix;
    private $SetSpec;
    private $DebugLevel;
    private $ResumptionToken = NULL;
    private $Cache = NULL;
    private $CacheSequenceNumber = 0;

    /**
     * Perform OAI query and return resulting data to caller.
     *
     * @param string $QueryVerb OAI-PMH verb (e.g. "Identify").
     * @param array $Args Query arguments, with argument names for the
     *      index.  (OPTIONAL)
     * @return string Raw response text, or an empty string if the server
     *      could not be reached or nothing could be read.
     */
    private function PerformQuery($QueryVerb, $Args = NULL)
    {
        # build query URL, allowing for a base URL that already has a
        # query string
        $Separator = (strpos($this->ServerUrl, "?") === FALSE) ? "?" : "&";
        $QueryUrl = $this->ServerUrl.$Separator."verb=".$QueryVerb;
        if ($Args)
        {
            foreach ($Args as $ArgName => $ArgValue)
            {
                $QueryUrl .= "&".urlencode($ArgName)."=".urlencode($ArgValue);
            }
        }

        # open stream to OAI server
        $FHndl = fopen($QueryUrl, "r");

        # if stream could not be opened there is nothing to read or close
        if ($FHndl === FALSE)
        {
            return "";
        }

        # read entire response and close OAI server stream
        $Text = stream_get_contents($FHndl);
        fclose($FHndl);

        # return query result data to caller
        return ($Text === FALSE) ? "" : $Text;
    }

    /**
     * Parse XML text into a SimpleXML object.
     *
     * @param string $XmlText Text to parse.
     * @return SimpleXMLElement Parsed XML, or NULL if the text was empty
     *      or could not be parsed.
     */
    private function ParseXml($XmlText)
    {
        # empty text (e.g. from a failed query) is never valid XML
        if (!is_string($XmlText) || (trim($XmlText) === ""))
        {
            return NULL;
        }

        $Xml = simplexml_load_string($XmlText);
        return ($Xml instanceof SimpleXMLElement) ? $Xml : NULL;
    }

    /**
     * Set array value if available in simplexml object.
     *
     * @param SimpleXMLElement $Xml Element to look in.
     * @param string $SrcName Name of child element to look for.
     * @param string $DstName Index at which to store the trimmed value.
     * @param array $Results Array to store value in.  (REFERENCE)
     */
    private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
    {
        if (isset($Xml->$SrcName))
        {
            $Results[$DstName] = trim((string)$Xml->$SrcName);
        }
    }

    /**
     * Print variable contents if debug is above specified level.
     *
     * @param int $Level Minimum debug level at which to print.
     * @param string $MethodName Name of calling method.
     * @param string $VarName Name of variable being printed.
     * @param mixed $VarValue Value to print (via print_r()).
     */
    private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
    {
        if ($this->DebugLevel >= $Level)
        {
            print("\n<pre>".$MethodName."() ".$VarName." = \n");
            print_r($VarValue);
            print("</pre>\n");
        }
    }

    /**
     * Pull records out of an XML DOMNode.
     *
     * Data converted from XML will be added to
     * $Records[$Index][$Section], with the XML from the DOM node
     * flattened.  For example, if we were to call
     * ExtractDataFromXml($Records, 0, $dom, "metadata") with $dom
     * pointing to XML like this and $Records initially empty:
     *
     * @code
     * <record xmlns="http://ns.nsdl.org/ncs/lar"
     *     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     *     xsi:schemaLocation="http://ns.nsdl.org/ncs/lar http://ns.nsdl.org/ncs/lar/1.00/schemas/lar.xsd">
     *   <recordID>2200/20121012134026795T</recordID>
     *   <recordDate>2012-07-24</recordDate>
     *   <identifier>http://chemteacher.chemeddl.org/services/chemteacher/index.php?option=com_content&amp;view=article&amp;id=77</identifier>
     *   <title>ChemTeacher: Periodic Table Resource Pak</title>
     *   <license>
     *     <name URL="http://creativecommons.org/licenses/by-sa/3.0/">Creative commons:Attribution share alike (by-sa)</name>
     *     <property>Attribution required</property>
     *     <property>Educational use only</property>
     *     <property>Share alike required</property>
     *   </license>
     * </record>
     * @endcode
     *
     * After the call, print_r($Records) would produce something like:
     * @code
     * Array
     * (
     *     [0] => Array
     *     (
     *         [metadata] => Array
     *         (
     *             [recordID] => Array ( [0] => 2200/20121012134026795T )
     *             [recordDate] => Array ( [0] => 2012-07-24 )
     *             [identifier] => Array
     *             (
     *                 [0] => http://chemteacher.chemeddl.org/services/chemteacher/index.php?option=com_content&view=article&id=77
     *             )
     *             [title] => Array ( [0] => ChemTeacher: Periodic Table Resource Pak )
     *             [license/name] => Array ( [0] => Creative commons:Attribution share alike (by-sa) )
     *             [license/property] => Array
     *             (
     *                 [0] => Attribution required
     *                 [1] => Educational use only
     *                 [2] => Share alike required
     *             )
     *         )
     *     )
     * )
     * @endcode
     *
     * @param array $Records to place data in.  (REFERENCE)
     * @param int $Index record number to populate
     * @param DOMNode $dom to extract data from
     * @param string $Section section of the record to populate (e.g.,
     *      metadata, about)
     * @param string $ParentTagName parent tag or null for the root of
     *      this record, should only be non-null when called recursively
     *      (OPTIONAL, default NULL)
     */
    private function ExtractDataFromXml(
            &$Records, $Index, DOMNode $dom, $Section, $ParentTagName=NULL)
    {
        foreach ($dom->childNodes as $node)
        {
            # only DOM children that are elements (rather than comments,
            # text, or something else) are of interest
            if ($node->nodeType != XML_ELEMENT_NODE)
            {
                continue;
            }

            # compute a tag name to use
            $StorageTagName =
                    (($ParentTagName !== NULL) ? $ParentTagName."/" : "")
                    .$node->nodeName;

            # glue together the contents of the 'text' children of this node
            $Value = "";
            foreach ($node->childNodes as $child)
            {
                if ($child->nodeType == XML_TEXT_NODE)
                {
                    $Value .= $child->nodeValue;
                }
            }

            # if we had a non-empty value, add it to the results
            if (strlen(trim($Value)) > 0)
            {
                $Records[$Index][$Section][$StorageTagName][] = $Value;
            }

            # and process our children
            $this->ExtractDataFromXml(
                    $Records, $Index, $node, $Section, $StorageTagName);
        }
    }

    /**
     * Get the first child of a DOM node that is an element.
     *
     * @param DOMNode $dom Node whose children are to be searched.
     * @return DOMNode First child element, or NULL if there is none.
     */
    private function GetFirstElement(DOMNode $dom)
    {
        foreach ($dom->childNodes as $child)
        {
            if ($child->nodeType == XML_ELEMENT_NODE)
            {
                return $child;
            }
        }

        return NULL;
    }

    /**
     * Extract records from the XML text of an OAI-PMH response.
     *
     * For ListRecords responses this also saves the resumption token for
     * the next batch (or clears it if the response had none).
     *
     * @param string $XmlText Raw XML response text.
     * @param string $ParseTo Name of the element (within the top-level
     *      response element) that contains the records, i.e. the verb
     *      used for the query ("ListRecords" or "GetRecord").
     * @return array List of records, each an array with "identifier" and
     *      "datestamp" entries plus, when available, "format", "metadata",
     *      and "about" entries, or NULL if the text could not be parsed.
     */
    private function GetRecordsFromXML($XmlText, $ParseTo)
    {
        # create XML parser and pass it text
        $Xml = $this->ParseXml($XmlText);

        # if text could not be parsed, return NULL
        if ($Xml === NULL)
        {
            return NULL;
        }

        # set up vars to hold our results
        $Records = array();
        $Index = 0;

        # we'll want to find our records with XPath, so we need to
        # register a prefix for the oai elements
        $Xml->registerXPathNamespace(
                'oai', "http://www.openarchives.org/OAI/2.0/");

        # extract records (xpath() does not return an array on error)
        $RecordXML = $Xml->xpath("oai:".$ParseTo."//oai:record");
        if (!is_array($RecordXML))
        {
            $RecordXML = array();
        }

        foreach ($RecordXML as $Record)
        {
            # pull relevant information out of the header
            #
            # Note that SimpleXMLElement objects map elements onto PHP
            # object properties, and will return a SimpleXMLElement w/o
            # any associated XML for non-existent elements.  So,
            # nothing explodes when we ask the Record for an element it
            # did not contain, and we get empty strings when the
            # children of 'header' &c are absent, which is what we want.
            $Records[$Index] = array(
                    "identifier" => (string)$Record->header->identifier,
                    "datestamp" => (string)$Record->header->datestamp,
                    );

            # grab associated metadata (if there is any)
            if ($Record->metadata->count() > 0)
            {
                # to avoid frustrations with namespaces and SimpleXML, use
                # DOM to walk the record data
                $MetadataNode = dom_import_simplexml($Record->metadata);

                # get the element that wraps the record's metadata (absent
                # when the metadata element is empty)
                $RootNode = ($MetadataNode instanceof DOMNode)
                        ? $this->GetFirstElement($MetadataNode) : NULL;

                if ($RootNode !== NULL)
                {
                    # record the format used for this record
                    $Records[$Index]["format"] = $RootNode->nodeName;

                    # extract data for this record
                    $this->ExtractDataFromXml(
                            $Records, $Index, $RootNode, "metadata");
                }
            }

            # if there is additional information available, snag that too
            if ($Record->about->count() > 0)
            {
                $AboutNode = dom_import_simplexml($Record->about);
                if ($AboutNode instanceof DOMNode)
                {
                    $this->ExtractDataFromXml(
                            $Records, $Index, $AboutNode, "about");
                }
            }

            # move along to the next record
            $Index++;
        }

        # resumption tokens only apply to batch pulls, so leave any saved
        # token alone when parsing the response to a single-record query
        if ($ParseTo == "ListRecords")
        {
            # look for resumption token and save if found (we'll get an
            # empty string if either ListRecords or resumptionToken
            # are absent)
            $Token = (string)$Xml->ListRecords->resumptionToken;
            $this->ResumptionToken = (strlen($Token) > 0) ? $Token : NULL;
        }

        # return records to caller
        return $Records;
    }
}
ResetRecordPointer()
Clear any additional records available after last GetRecords().
Definition: OAIClient.php:285
ServerUrl($NewValue=NULL)
Get or set URL of target OAI repository server.
Definition: OAIClient.php:79
OAIClient($ServerUrl, $Cache=NULL)
Class constructor.
Definition: OAIClient.php:48
GetRecord($Id)
Get a single record from a repository server.
Definition: OAIClient.php:260
MoreRecordsAvailable()
Check whether more records are available after last GetRecords().
Definition: OAIClient.php:277
GetRecords($StartDate=NULL, $EndDate=NULL)
Retrieve records from repository server.
Definition: OAIClient.php:195
MetadataPrefix($NewValue=NULL)
Get or set metadata schema for records being retrieved.
Definition: OAIClient.php:94
SetSpec($NewValue="X-NOSETSPECVALUE-X")
Get or set specification of subset of records to be retrieved.
Definition: OAIClient.php:109
GetIdentification()
Retrieve identification information from repository server.
Definition: OAIClient.php:125
SetDebugLevel($NewLevel)
Set current debug output level.
Definition: OAIClient.php:296
GetFormats()
Retrieve list of available metadata formats from repository server.
Definition: OAIClient.php:155