Orcus
csv_parser.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8#ifndef ORCUS_CSV_PARSER_HPP
9#define ORCUS_CSV_PARSER_HPP
10
11#include "csv_parser_base.hpp"
12
13namespace orcus {
14
15template<typename _Handler>
17{
18public:
19 typedef _Handler handler_type;
20
21 csv_parser(const char* p, size_t n, handler_type& hdl, const csv::parser_config& config);
22 void parse();
23
24private:
25
26 // handlers
27 void row();
28 void cell();
29 void quoted_cell();
30
31 void parse_cell_with_quote(const char* p0, size_t len0);
32
36 void push_cell_value(const char* p, size_t n);
37
38private:
39 handler_type& m_handler;
40};
41
42template<typename _Handler>
44 const char* p, size_t n, handler_type& hdl, const csv::parser_config& config) :
45 csv::parser_base(p, n, config), m_handler(hdl) {}
46
47template<typename _Handler>
48void csv_parser<_Handler>::parse()
49{
50#if ORCUS_DEBUG_CSV
51 for (const char* p = mp_begin; p < mp_end; ++p)
52 std::cout << *p;
53 std::cout << std::endl;
54#endif
55
56 m_handler.begin_parse();
57 while (has_char())
58 row();
59 m_handler.end_parse();
60}
61
62template<typename _Handler>
63void csv_parser<_Handler>::row()
64{
65 m_handler.begin_row();
66 while (true)
67 {
68 if (is_text_qualifier(cur_char()))
69 quoted_cell();
70 else
71 cell();
72
73 if (!has_char())
74 {
75 m_handler.end_row();
76 return;
77 }
78
79 char c = cur_char();
80 if (c == '\n')
81 {
82 next();
83#if ORCUS_DEBUG_CSV
84 cout << "(LF)" << endl;
85#endif
86 m_handler.end_row();
87 return;
88 }
89
90 if (!is_delim(c))
91 throw orcus::csv::parse_error("expected a delimiter");
92
93 next();
94
95 if (m_config.trim_cell_value)
96 skip_blanks();
97
98 if (!has_char())
99 {
100 m_handler.end_row();
101 return;
102 }
103 }
104}
105
106template<typename _Handler>
107void csv_parser<_Handler>::cell()
108{
109 const char* p = mp_char;
110 size_t len = 0;
111 char c = cur_char();
112 while (c != '\n' && !is_delim(c))
113 {
114 ++len;
115 next();
116 if (!has_char())
117 break;
118 c = cur_char();
119 }
120
121 if (!len)
122 p = nullptr;
123
124 push_cell_value(p, len);
125}
126
127template<typename _Handler>
128void csv_parser<_Handler>::quoted_cell()
129{
130#if ORCUS_DEBUG_CSV
131 cout << "--- quoted cell" << endl;
132#endif
133 char c = cur_char();
134 assert(is_text_qualifier(c));
135 next(); // Skip the opening quote.
136 if (!has_char())
137 return;
138
139 const char* p0 = mp_char;
140 size_t len = 1;
141 for (; has_char(); next(), ++len)
142 {
143 c = cur_char();
144#if ORCUS_DEBUG_CSV
145 cout << "'" << c << "'" << endl;
146#endif
147 if (!is_text_qualifier(c))
148 continue;
149
150 // current char is a quote. Check if the next char is also a text
151 // qualifier.
152
153 if (has_next() && is_text_qualifier(next_char()))
154 {
155 next();
156 parse_cell_with_quote(p0, len);
157 return;
158 }
159
160 // Closing quote.
161 m_handler.cell(p0, len-1, false);
162 next();
163 skip_blanks();
164 return;
165 }
166
167 // Stream ended prematurely. Handle it gracefully.
168 m_handler.cell(p0, len, false);
169}
170
171template<typename _Handler>
172void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0)
173{
174#if ORCUS_DEBUG_CSV
175 using namespace std;
176 cout << "--- parse cell with quote" << endl;
177#endif
178 assert(is_text_qualifier(cur_char()));
179
180 // Push the preceding chars to the temp buffer.
181 m_cell_buf.reset();
182 m_cell_buf.append(p0, len0);
183
184 // Parse the rest, until the closing quote.
185 next();
186 const char* p_cur = mp_char;
187 size_t cur_len = 0;
188 for (; has_char(); next(), ++cur_len)
189 {
190 char c = cur_char();
191#if ORCUS_DEBUG_CSV
192 cout << "'" << c << "'" << endl;
193#endif
194 if (!is_text_qualifier(c))
195 continue;
196
197 if (has_next() && is_text_qualifier(next_char()))
198 {
199 // double quotation. Copy the current segment to the cell buffer.
200 m_cell_buf.append(p_cur, cur_len);
201
202 next(); // to the 2nd quote.
203 p_cur = mp_char;
204 cur_len = 0;
205 continue;
206 }
207
208 // closing quote. Flush the current segment to the cell
209 // buffer, push the value to the handler, and exit normally.
210 m_cell_buf.append(p_cur, cur_len);
211
212 m_handler.cell(m_cell_buf.get(), m_cell_buf.size(), true);
213 next();
214 skip_blanks();
215 return;
216 }
217
218 // Stream ended prematurely.
219 throw csv::parse_error("stream ended prematurely while parsing quoted cell.");
220}
221
222template<typename _Handler>
223void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
224{
225 size_t len = n;
226
227 if (m_config.trim_cell_value)
228 {
229 // Trim any leading blanks.
230 for (size_t i = 0; i < n; ++i, --len, ++p)
231 {
232 if (!is_blank(*p))
233 break;
234 }
235
236 // Trim any trailing blanks.
237 if (len)
238 {
239 const char* p_end = p + (len-1);
240 for (; p != p_end; --p_end, --len)
241 {
242 if (!is_blank(*p_end))
243 break;
244 }
245 }
246 }
247
248 m_handler.cell(p, len, false);
249#if ORCUS_DEBUG_CSV
250 if (len)
251 cout << "(cell:'" << std::string(p, len) << "')" << endl;
252 else
253 cout << "(cell:'')" << endl;
254#endif
255}
256
257}
258
259#endif
260/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: csv_parser_base.hpp:58
Definition: csv_parser_base.hpp:67
Definition: csv_parser.hpp:17
Definition: parser_base.hpp:40
Definition: config.hpp:19
Definition: csv_parser_base.hpp:37