1 |
efrain |
1 |
<?php
|
|
|
2 |
// This file is part of Moodle - http://moodle.org/
|
|
|
3 |
//
|
|
|
4 |
// Moodle is free software: you can redistribute it and/or modify
|
|
|
5 |
// it under the terms of the GNU General Public License as published by
|
|
|
6 |
// the Free Software Foundation, either version 3 of the License, or
|
|
|
7 |
// (at your option) any later version.
|
|
|
8 |
//
|
|
|
9 |
// Moodle is distributed in the hope that it will be useful,
|
|
|
10 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
11 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
12 |
// GNU General Public License for more details.
|
|
|
13 |
//
|
|
|
14 |
// You should have received a copy of the GNU General Public License
|
|
|
15 |
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
|
|
|
16 |
|
|
|
17 |
/**
|
|
|
18 |
* Document representation.
|
|
|
19 |
*
|
|
|
20 |
* @package search_solr
|
|
|
21 |
* @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
|
|
|
22 |
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
|
|
|
23 |
*/
|
|
|
24 |
|
|
|
25 |
namespace search_solr;
|
|
|
26 |
|
|
|
27 |
// Internal-access guard: stop execution immediately unless the MOODLE_INTERNAL constant has been defined.
defined('MOODLE_INTERNAL') || die();
|
|
|
28 |
|
|
|
29 |
/**
 * Represents a document to index.
 *
 * @copyright 2015 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class document extends \core_search\document {
    /**
     * Indicates the file contents were not indexed due to an error.
     */
    const INDEXED_FILE_ERROR = -1;

    /**
     * Indicates the file contents were not indexed due to filtering/settings.
     */
    const INDEXED_FILE_FALSE = 0;

    /**
     * Indicates the file contents are indexed with the record.
     */
    const INDEXED_FILE_TRUE = 1;

    /**
     * Any fields that are engine specific. These are fields that are solely used by a search engine plugin
     * for internal purposes.
     *
     * @var array
     */
    protected static $enginefields = array(
        'solr_filegroupingid' => array(
            'type' => 'string',
            'stored' => true,
            'indexed' => true
        ),
        'solr_fileid' => array(
            'type' => 'string',
            'stored' => true,
            'indexed' => true
        ),
        'solr_filecontenthash' => array(
            'type' => 'string',
            'stored' => true,
            'indexed' => true
        ),
        // Stores the status of file indexing.
        'solr_fileindexstatus' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        // Field to index, but not store, file contents.
        'solr_filecontent' => array(
            'type' => 'text',
            'stored' => false,
            'indexed' => true,
            'mainquery' => true
        )
    );

    /**
     * Formats the timestamp according to the search engine needs.
     *
     * The timestamp is rendered in GMT using the engine's DATE_FORMAT.
     *
     * @param int $timestamp
     * @return string
     */
    public static function format_time_for_engine($timestamp) {
        return gmdate(\search_solr\engine::DATE_FORMAT, $timestamp);
    }

    /**
     * Formats a string according to the search engine needs.
     *
     * The string is passed through \core_text::str_max_bytes() with a limit of 32766 bytes.
     *
     * @param string $string
     * @return string
     */
    public static function format_string_for_engine($string) {
        // 2^15 default. We could convert this to a setting as it is possible to
        // change the max in solr.
        return \core_text::str_max_bytes($string, 32766);
    }

    /**
     * Returns a timestamp from the value stored in the search engine.
     *
     * @param string $time
     * @return int
     */
    public static function import_time_from_engine($time) {
        return strtotime($time);
    }

    /**
     * Overwritten to use HTML (highlighting).
     *
     * @return int
     */
    protected function get_text_format() {
        return FORMAT_HTML;
    }

    /**
     * Formats a text string coming from the search engine.
     *
     * Even if this is called through an external function it is fine to return HTML as
     * HTML is considered solr's search engine text format. An external function can ask
     * for raw text, but this just means that it will not pass through format_text, not that
     * we can not add HTML.
     *
     * @param string $text Text to format
     * @return string HTML text to be rendered
     */
    protected function format_text($text) {
        // Since we allow output for highlighting, we need to encode html entities.
        // This ensures plaintext html chars don't become valid html.
        $out = s($text);

        $startcount = 0;
        $endcount = 0;

        // Remove end/start pairs that span a few common separation characters. Allows us to highlight phrases instead of words.
        // The markers are quoted so that they are always matched literally inside the pattern.
        $regex = '|' . preg_quote(engine::HIGHLIGHT_END, '|') . '([ .,-]{0,3})' . preg_quote(engine::HIGHLIGHT_START, '|') . '|';
        $out = preg_replace($regex, '$1', $out);

        // Now replace our start and end highlight markers.
        $out = str_replace(engine::HIGHLIGHT_START, '<span class="highlight">', $out, $startcount);
        $out = str_replace(engine::HIGHLIGHT_END, '</span>', $out, $endcount);

        // This makes sure any highlight tags are balanced, in case truncation or the highlight text contained our markers.
        // Pad with exactly the number of missing tags so the balancing always terminates.
        if ($startcount > $endcount) {
            $out .= str_repeat('</span>', $startcount - $endcount);
        } else if ($endcount > $startcount) {
            $out = str_repeat('<span class="highlight">', $endcount - $startcount) . $out;
        }

        return parent::format_text($out);
    }

    /**
     * Apply any defaults to unset fields before export. Called after document building, but before export.
     *
     * Sub-classes of this should make sure to call parent::apply_defaults().
     */
    protected function apply_defaults() {
        parent::apply_defaults();

        // We want to set the solr_filegroupingid to id if it isn't set.
        if (!isset($this->data['solr_filegroupingid'])) {
            $this->data['solr_filegroupingid'] = $this->data['id'];
        }
    }

    /**
     * Export the data for the given file in relation to this document.
     *
     * Starts from this document's exported data, drops the text fields and overrides
     * the id, type, title, modified time and the solr_file* fields with the file's values.
     *
     * @param \stored_file $file The stored file we are talking about.
     * @return array
     */
    public function export_file_for_engine($file) {
        $data = $this->export_for_engine();

        // Content is indexed in the main document.
        unset($data['content']);
        unset($data['description1']);
        unset($data['description2']);

        // Going to append the fileid to give it a unique id.
        $data['id'] = $data['id'].'-solrfile'.$file->get_id();
        $data['type'] = \core_search\manager::TYPE_FILE;
        $data['solr_fileid'] = $file->get_id();
        $data['solr_filecontenthash'] = $file->get_contenthash();
        $data['solr_fileindexstatus'] = self::INDEXED_FILE_TRUE;
        $data['title'] = $file->get_filename();
        $data['modified'] = self::format_time_for_engine($file->get_timemodified());

        return $data;
    }
}
|