Orcus
sax_parser.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8#ifndef ORCUS_SAX_PARSER_HPP
9#define ORCUS_SAX_PARSER_HPP
10
11#include "sax_parser_base.hpp"
12
13namespace orcus {
14
/**
 * Default compile-time configuration for sax_parser.  It is used as the
 * default value of the parser's second template parameter.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version encoded as an integer.  The parser treats a
     * value of 11 or greater as XML 1.1, in which case the stream is
     * required to start with an <?xml ... ?> declaration; with a lower
     * value (such as the default of 10) the declaration is optional.
     */
    static const uint8_t baseline_version = 10;
};
24
29template<typename _Handler, typename _Config = sax_parser_default_config>
31{
32public:
33 typedef _Handler handler_type;
34 typedef _Config config_type;
35
36 sax_parser(const char* content, const size_t size, handler_type& handler);
37 sax_parser(const char* content, const size_t size, bool transient_stream, handler_type& handler);
39
40 void parse();
41
42private:
43
48 void header();
49 void body();
50 void element();
51 void element_open(std::ptrdiff_t begin_pos);
52 void element_close(std::ptrdiff_t begin_pos);
53 void special_tag();
54 void declaration(const char* name_check);
55 void cdata();
56 void doctype();
57 void characters();
58 void attribute();
59
60private:
61 handler_type& m_handler;
62};
63
64template<typename _Handler, typename _Config>
66 const char* content, const size_t size, handler_type& handler) :
67 sax::parser_base(content, size, false),
68 m_handler(handler)
69{
70}
71
72template<typename _Handler, typename _Config>
73sax_parser<_Handler,_Config>::sax_parser(
74 const char* content, const size_t size, bool transient_stream, handler_type& handler) :
75 sax::parser_base(content, size, transient_stream),
76 m_handler(handler)
77{
78}
79
80template<typename _Handler, typename _Config>
81sax_parser<_Handler,_Config>::~sax_parser()
82{
83}
84
85template<typename _Handler, typename _Config>
86void sax_parser<_Handler,_Config>::parse()
87{
88 m_nest_level = 0;
89 mp_char = mp_begin;
90 header();
91 skip_space_and_control();
92 body();
93
94 assert(m_buffer_pos == 0);
95}
96
97template<typename _Handler, typename _Config>
98void sax_parser<_Handler,_Config>::header()
99{
100 // we don't handle multi byte encodings so we can just skip bom entry if exists.
101 skip_bom();
102 skip_space_and_control();
103 if (!has_char() || cur_char() != '<')
104 throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
105
106 if (config_type::baseline_version >= 11)
107 {
108 // XML version 1.1 requires a header declaration whereas in 1.0 it's
109 // optional.
110 if (next_char_checked() != '?')
111 throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
112
113 declaration("xml");
114 }
115}
116
117template<typename _Handler, typename _Config>
118void sax_parser<_Handler,_Config>::body()
119{
120 while (has_char())
121 {
122 if (cur_char() == '<')
123 {
124 element();
125 if (!m_root_elem_open)
126 // Root element closed. Stop parsing.
127 return;
128 }
129 else if (m_nest_level)
130 // Call characters only when in xml hierarchy.
131 characters();
132 else
133 next();
134 }
135}
136
137template<typename _Handler, typename _Config>
138void sax_parser<_Handler,_Config>::element()
139{
140 assert(cur_char() == '<');
141 std::ptrdiff_t pos = offset();
142 char c = next_char_checked();
143 switch (c)
144 {
145 case '/':
146 element_close(pos);
147 break;
148 case '!':
149 special_tag();
150 break;
151 case '?':
152 declaration(nullptr);
153 break;
154 default:
155 if (!is_alpha(c) && c != '_')
156 throw sax::malformed_xml_error("expected an alphabet.", offset());
157 element_open(pos);
158 }
159}
160
161template<typename _Handler, typename _Config>
162void sax_parser<_Handler,_Config>::element_open(std::ptrdiff_t begin_pos)
163{
164 assert(is_alpha(cur_char()) || cur_char() == '_');
165
166 sax::parser_element elem;
167 element_name(elem, begin_pos);
168
169 while (true)
170 {
171 skip_space_and_control();
172 char c = cur_char();
173 if (c == '/')
174 {
175 // Self-closing element: <element/>
176 if (next_and_char() != '>')
177 throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
178 next();
179 elem.end_pos = offset();
180 m_handler.start_element(elem);
181 reset_buffer_pos();
182 m_handler.end_element(elem);
183 if (!m_nest_level)
184 m_root_elem_open = false;
185#if ORCUS_DEBUG_SAX_PARSER
186 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
187#endif
188 return;
189 }
190 else if (c == '>')
191 {
192 // End of opening element: <element>
193 next();
194 elem.end_pos = offset();
195 nest_up();
196 m_handler.start_element(elem);
197 reset_buffer_pos();
198#if ORCUS_DEBUG_SAX_PARSER
199 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
200#endif
201 return;
202 }
203 else
204 attribute();
205 }
206}
207
208template<typename _Handler, typename _Config>
209void sax_parser<_Handler,_Config>::element_close(std::ptrdiff_t begin_pos)
210{
211 assert(cur_char() == '/');
212 nest_down();
213 next_check();
214 sax::parser_element elem;
215 element_name(elem, begin_pos);
216
217 if (cur_char() != '>')
218 throw sax::malformed_xml_error("expected '>' to close the element.", offset());
219 next();
220 elem.end_pos = offset();
221
222 m_handler.end_element(elem);
223#if ORCUS_DEBUG_SAX_PARSER
224 cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
225#endif
226 if (!m_nest_level)
227 m_root_elem_open = false;
228}
229
230template<typename _Handler, typename _Config>
231void sax_parser<_Handler,_Config>::special_tag()
232{
233 assert(cur_char() == '!');
234 // This can be either <![CDATA, <!--, or <!DOCTYPE.
235 size_t len = remains();
236 if (len < 2)
237 throw sax::malformed_xml_error("special tag too short.", offset());
238
239 switch (next_and_char())
240 {
241 case '-':
242 {
243 // Possibly comment.
244 if (next_and_char() != '-')
245 throw sax::malformed_xml_error("comment expected.", offset());
246
247 len -= 2;
248 if (len < 3)
249 throw sax::malformed_xml_error("malformed comment.", offset());
250
251 next();
252 comment();
253 }
254 break;
255 case '[':
256 {
257 // Possibly a CDATA.
258 expects_next("CDATA[", 6);
259 if (has_char())
260 cdata();
261 }
262 break;
263 case 'D':
264 {
265 // check if this is a DOCTYPE.
266 expects_next("OCTYPE", 6);
267 skip_space_and_control();
268 if (has_char())
269 doctype();
270 }
271 break;
272 default:
273 throw sax::malformed_xml_error("failed to parse special tag.", offset());
274 }
275}
276
277template<typename _Handler, typename _Config>
278void sax_parser<_Handler,_Config>::declaration(const char* name_check)
279{
280 assert(cur_char() == '?');
281 next_check();
282
283 // Get the declaration name first.
284 pstring decl_name;
285 name(decl_name);
286#if ORCUS_DEBUG_SAX_PARSER
287 cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
288#endif
289
290 if (name_check && decl_name != name_check)
291 {
292 std::ostringstream os;
293 os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
294 throw sax::malformed_xml_error(os.str(), offset());
295 }
296
297 m_handler.start_declaration(decl_name);
298 skip_space_and_control();
299
300 // Parse the attributes.
301 while (cur_char_checked() != '?')
302 {
303 attribute();
304 skip_space_and_control();
305 }
306 if (next_char_checked() != '>')
307 throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
308
309 m_handler.end_declaration(decl_name);
310 reset_buffer_pos();
311 next();
312#if ORCUS_DEBUG_SAX_PARSER
313 cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
314#endif
315}
316
317template<typename _Handler, typename _Config>
318void sax_parser<_Handler,_Config>::cdata()
319{
320 size_t len = remains();
321 assert(len > 3);
322
323 // Parse until we reach ']]>'.
324 const char* p0 = mp_char;
325 size_t i = 0, match = 0;
326 for (char c = cur_char(); i < len; ++i, c = next_and_char())
327 {
328 if (c == ']')
329 {
330 // Be aware that we may encounter a series of more than two ']'
331 // characters, in which case we'll only count the last two.
332
333 if (match == 0)
334 // First ']'
335 ++match;
336 else if (match == 1)
337 // Second ']'
338 ++match;
339 }
340 else if (c == '>' && match == 2)
341 {
342 // Found ']]>'.
343 size_t cdata_len = i - 2;
344 m_handler.characters(pstring(p0, cdata_len), transient_stream());
345 next();
346 return;
347 }
348 else
349 match = 0;
350 }
351 throw sax::malformed_xml_error("malformed CDATA section.", offset());
352}
353
354template<typename _Handler, typename _Config>
355void sax_parser<_Handler,_Config>::doctype()
356{
357 // Parse the root element first.
358 sax::doctype_declaration param;
359 name(param.root_element);
360 skip_space_and_control();
361
362 // Either PUBLIC or SYSTEM.
363 size_t len = remains();
364 if (len < 6)
365 throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
366
367 param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
368 char c = cur_char();
369 if (c == 'P')
370 {
371 if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
372 throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
373
374 param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
375 }
376 else if (c == 'S')
377 {
378 if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
379 throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
380 }
381
382 next_check();
383 skip_space_and_control();
384 has_char_throw("DOCTYPE section too short.");
385
386 // Parse FPI.
387 value(param.fpi, false);
388
389 has_char_throw("DOCTYPE section too short.");
390 skip_space_and_control();
391 has_char_throw("DOCTYPE section too short.");
392
393 if (cur_char() == '>')
394 {
395 // Optional URI not given. Exit.
396#if ORCUS_DEBUG_SAX_PARSER
397 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
398#endif
399 m_handler.doctype(param);
400 next();
401 return;
402 }
403
404 // Parse optional URI.
405 value(param.uri, false);
406
407 has_char_throw("DOCTYPE section too short.");
408 skip_space_and_control();
409 has_char_throw("DOCTYPE section too short.");
410
411 if (cur_char() != '>')
412 throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
413
414#if ORCUS_DEBUG_SAX_PARSER
415 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
416#endif
417 m_handler.doctype(param);
418 next();
419}
420
421template<typename _Handler, typename _Config>
422void sax_parser<_Handler,_Config>::characters()
423{
424 const char* p0 = mp_char;
425 for (; has_char(); next())
426 {
427 if (cur_char() == '<')
428 break;
429
430 if (cur_char() == '&')
431 {
432 // Text span with one or more encoded characters. Parse using cell buffer.
433 cell_buffer& buf = get_cell_buffer();
434 buf.reset();
435 buf.append(p0, mp_char-p0);
436 characters_with_encoded_char(buf);
437 if (buf.empty())
438 m_handler.characters(pstring(), transient_stream());
439 else
440 m_handler.characters(pstring(buf.get(), buf.size()), true);
441 return;
442 }
443 }
444
445 if (mp_char > p0)
446 {
447 pstring val(p0, mp_char-p0);
448 m_handler.characters(val, transient_stream());
449 }
450}
451
452template<typename _Handler, typename _Config>
453void sax_parser<_Handler,_Config>::attribute()
454{
455 sax::parser_attribute attr;
456 pstring attr_ns_name, attr_name, attr_value;
457 attribute_name(attr.ns, attr.name);
458
459#if ORCUS_DEBUG_SAX_PARSER
460 cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
461#endif
462
463 skip_space_and_control();
464
465 char c = cur_char();
466 if (c != '=')
467 {
468 std::ostringstream os;
469 os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
470 throw sax::malformed_xml_error(os.str(), offset());
471 }
472
473 next_check(); // skip the '='.
474 skip_space_and_control();
475
476 attr.transient = value(attr.value, true);
477 if (attr.transient)
478 // Value is stored in a temporary buffer. Push a new buffer.
479 inc_buffer_pos();
480
481#if ORCUS_DEBUG_SAX_PARSER
482 cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
483#endif
484
485 m_handler.attribute(attr);
486}
487
488}
489
490#endif
491/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: parser_base.hpp:40
Definition: sax_parser_base.hpp:109
Definition: sax_parser.hpp:31
Definition: sax_parser.hpp:16
static const uint8_t baseline_version
Definition: sax_parser.hpp:22