Blame view

sources/apps/search_lucene/document/Pdf.php 1.84 KB
42e4f8d60   Kload   add all apps
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
  <?php
  
  namespace OCA\Search_Lucene\Document;
  
  use \OCP\Util;
  /**
   * PDF document
   */
  class Pdf extends \Zend_Search_Lucene_Document
  {
  
      /**
       * Object constructor
       *
       * @param string  $data
       * @param boolean $storeContent
       */
      private function __construct($data, $storeContent) {
  
  		try {
  			$zendpdf = \Zend_Pdf::parse($data);
  
  			// Store meta data properties
  			if (isset($zendpdf->properties['Title'])) {
  				$this->addField(\Zend_Search_Lucene_Field::UnStored('title', $zendpdf->properties['Title']));
  			}
  			if (isset($zendpdf->properties['Author'])) {
  				$this->addField(\Zend_Search_Lucene_Field::UnStored('author', $zendpdf->properties['Author']));
  			}
  			if (isset($zendpdf->properties['Subject'])) {
  				$this->addField(\Zend_Search_Lucene_Field::UnStored('subject', $zendpdf->properties['Subject']));
  			}
  			if (isset($zendpdf->properties['Keywords'])) {
  				$this->addField(\Zend_Search_Lucene_Field::UnStored('keywords', $zendpdf->properties['Keywords']));
  			}
  			//TODO handle PDF 1.6 metadata Zend_Pdf::getMetadata()
  
  			//do the content extraction
  			$pdfParse = new \App_Search_Helper_PdfParser();
  			$body = $pdfParse->pdf2txt($zendpdf->render());
  
  
  			if ($body != '') {
  				// Store contents
  				if ($storeContent) {
  					$this->addField(\Zend_Search_Lucene_Field::Text('body', $body, 'UTF-8'));
  				} else {
  					$this->addField(\Zend_Search_Lucene_Field::UnStored('body', $body, 'UTF-8'));
  				}
  			}
  
  		} catch (\Exception $e) {
  			Util::writeLog('search_lucene',
  				$e->getMessage() . ' Trace:
  ' . $e->getTraceAsString(),
  				Util::ERROR);
  		}
  
      }
  
      /**
       * Load PDF document from a string
       *
       * @param string  $data
       * @param boolean $storeContent
       * @return Pdf
       */
      public static function loadPdf($data, $storeContent = false)
      {
          return new Pdf($data, false, $storeContent);
      }
  
  
  }