Thea
StringAlg.cpp
1 //============================================================================
2 //
3 // This file is part of the Thea toolkit.
4 //
5 // This software is distributed under the BSD license, as detailed in the
6 // accompanying LICENSE.txt file. Portions are derived from other works:
7 // their respective licenses and copyright information are reproduced in
8 // LICENSE.txt and/or in the relevant source files.
9 //
10 // Author: Siddhartha Chaudhuri
11 // First version: 2013
12 //
13 //============================================================================
14 
15 /*
16  ORIGINAL HEADER
17 
18  @file stringutils.cpp
19 
20  @maintainer Morgan McGuire, http://graphics.cs.williams.edu
21 
22  @created 2000-09-09
23  @edited 2008-01-10
24 */
25 
26 #include "StringAlg.hpp"
27 #include <algorithm>
28 #include <cmath>
29 #include <cctype>
30 #include <cstdio>
31 #include <cstring>
32 
33 // Not cstdarg, to make sure we pull in the vsnprintf etc functions
34 #include <stdarg.h>
35 
36 #ifdef _WIN32
37  #include <windows.h>
38 #endif
39 
40 namespace Thea {
41 
42 #ifdef _MSC_VER
43 // disable: "C++ exception handler used"
44 # pragma warning (push)
45 # pragma warning (disable : 4530)
46 #endif
47 
48 #ifdef THEA_WINDOWS
49 
50 char const * NEWLINE = "\r\n";
51 
52 #else
53 
54 char const * NEWLINE = "\n";
55 
56 #endif
57 
58 void
59 parseCommaSeparated(std::string const & s, Array<std::string> & array, bool strip_quotes)
60 {
61  array.clear();
62 
63  if (s.empty())
64  return;
65 
66  size_t begin = 0;
67  char const delimiter = ',';
68  char const quote = '\"';
69 
70  do
71  {
72  size_t end = begin;
73  // Find the next comma, or the end of the string
74  bool in_quotes = false;
75 
76  while ((end < s.length()) && (in_quotes || (s[end] != delimiter)))
77  {
78  if (s[end] == quote)
79  {
80  if ((end < s.length() - 2) && (s[end + 1] == quote) && (s[end + 2]) == quote)
81  {
82  // Skip over the superquote
83  end += 2;
84  }
85 
86  in_quotes = ! in_quotes;
87  }
88 
89  ++end;
90  }
91 
92  array.push_back(s.substr(begin, end - begin));
93  begin = end + 1;
94 
95  } while (begin < s.length());
96 
97  if (strip_quotes)
98  {
99  for (size_t i = 0; i < array.size(); ++i)
100  {
101  std::string & t = array[i];
102  size_t L = t.length();
103 
104  if ((L > 1) && (t[0] == quote) && (t[L - 1] == quote))
105  {
106  if ((L > 6) && (t[1] == quote) && (t[2] == quote) && (t[L - 3] == quote) && (t[L - 2] == quote))
107  {
108  // Triple-quote
109  t = t.substr(3, L - 6);
110  }
111  else
112  {
113  // Double-quote
114  t = t.substr(1, L - 2);
115  }
116  }
117  }
118  }
119 }
120 
121 bool
122 beginsWith(std::string const & test, std::string const & pattern)
123 {
124  if (test.size() >= pattern.size())
125  {
126  for (size_t i = 0; i < pattern.size(); ++i)
127  {
128  if (pattern[i] != test[i])
129  {
130  return false;
131  }
132  }
133 
134  return true;
135  }
136  else
137  {
138  return false;
139  }
140 }
141 
142 bool
143 endsWith(std::string const & test, std::string const & pattern)
144 {
145  if (test.size() >= pattern.size())
146  {
147  intx te = (intx)test.size() - 1;
148  intx pe = (intx)pattern.size() - 1;
149 
150  for (intx i = (intx)pattern.size() - 1; i >= 0; --i)
151  {
152  if (pattern[(size_t)(pe - i)] != test[(size_t)(te - i)])
153  {
154  return false;
155  }
156  }
157 
158  return true;
159  }
160  else
161  {
162  return false;
163  }
164 }
165 
166 std::string
167 wordWrap(std::string const & input, intx num_cols, char const * newline)
168 {
169  alwaysAssertM(newline, "wordWrap: Newline string cannot be a null pointer");
170 
171  std::ostringstream output;
172 
173  size_t c = 0;
174  intx len;
175  // Don't make lines less than this length
176  intx min_length = num_cols / 4;
177  size_t in_len = input.size();
178  bool first = true;
179 
180  while (c < in_len)
181  {
182  if (first)
183  {
184  first = false;
185  }
186  else
187  {
188  output << newline;
189  }
190 
191  if ((intx)in_len - (intx)c - 1 < num_cols)
192  {
193  // The end
194  output << input.substr(c, in_len - c);
195  break;
196  }
197 
198  len = num_cols;
199 
200  // Look at character c + num_cols, see if it is a space.
201  while ((len > min_length) &&
202  (input[(size_t)(c + len)] != ' '))
203  {
204  len--;
205  }
206 
207  if (len == min_length)
208  {
209  // Just crop
210  len = num_cols;
211  }
212 
213  output << input.substr(c, len);
214  c += len;
215 
216  if (c < input.size())
217  {
218  // Collapse multiple spaces.
219  while ((input[c] == ' ') && (c < input.size()))
220  {
221  c++;
222  }
223  }
224  }
225 
226  return output.str();
227 }
228 
229 std::string
230 toUpper(std::string const & s)
231 {
232  std::string result = s;
233  std::transform(result.begin(), result.end(), result.begin(), ::toupper);
234  return result;
235 }
236 
237 std::string
238 toLower(std::string const & s)
239 {
240  std::string result = s;
241  std::transform(result.begin(), result.end(), result.begin(), ::tolower);
242  return result;
243 }
244 
245 intx
246 stringSplit(std::string const & s, char split_char, Array<std::string> & result, bool skip_empty_fields)
247 {
248  return stringSplit(s, std::string(1, split_char), result, skip_empty_fields);
249 }
250 
251 intx
252 stringSplit(std::string const & s, std::string const & split_chars, Array<std::string> & result, bool skip_empty_fields)
253 {
254  result.clear();
255 
256  // Pointer to delimiters
257  char const * delims = split_chars.c_str();
258 
259  // Pointers to the beginning and end of the substring
260  char const * start = s.c_str();
261  char const * stop = start;
262 
263  while ((stop = std::strpbrk(start, delims)))
264  {
265  if (!skip_empty_fields || stop != start)
266  result.push_back(std::string(start, stop - start));
267 
268  start = stop + 1;
269  }
270 
271  // Append the last one
272  std::string last = std::string(start);
273  if (!skip_empty_fields || !last.empty())
274  result.push_back(last);
275 
276  return (intx)result.size();
277 }
278 
279 std::string
280 trimWhitespace(std::string const & s)
281 {
282  size_t left = 0;
283 
284  // Trim from left
285  while ((left < s.length()) && isWhitespace(s[left]))
286  {
287  ++left;
288  }
289 
290  intx right = (intx)s.length() - 1;
291 
292  // Trim from right
293  while ((right > (intx)left) && isWhitespace(s[(size_t)right]))
294  {
295  --right;
296  }
297 
298  return s.substr(left, (size_t)(right - (intx)left + 1));
299 }
300 
301 // If your platform does not have vsnprintf, you can find a
302 // implementation at http://www.ijs.si/software/snprintf/
303 
304 std::string THEA_CDECL
305 format(char const * fmt, ...)
306 {
307  va_list arg_list;
308  va_start(arg_list, fmt);
309  std::string result = vformat(fmt, arg_list);
310  va_end(arg_list);
311 
312  return result;
313 }
314 
315 #if defined(_MSC_VER) && (_MSC_VER >= 1300)
316 
317 // Both MSVC seems to use the non-standard vsnprintf
318 // so we are using vscprintf to determine buffer size, however
319 // only MSVC7 and up headers include vscprintf for some reason.
320 std::string
321 vformat(char const * fmt, va_list arg_ptr)
322 {
323  alwaysAssertM(fmt, "vformat: Format string cannot be a null pointer");
324 
325  // We draw the line at a 1MB string.
326  static intx const MAX_SIZE = 1000000;
327 
328  // If the string is less than 161 characters, allocate it on the stack because this saves the malloc/free time.
329  static intx const BUF_SIZE = 161;
330 
331  // MSVC does not support va_copy
332  intx actual_size = _vscprintf(fmt, arg_ptr) + 1;
333 
334  if (actual_size > BUF_SIZE)
335  {
336  // Now use the heap.
337  char * heap_buffer = nullptr;
338 
339  if (actual_size < MAX_SIZE)
340  {
341  heap_buffer = (char *)std::malloc(MAX_SIZE + 1);
342  _vsnprintf(heap_buffer, MAX_SIZE, fmt, arg_ptr);
343  heap_buffer[MAX_SIZE] = '\0';
344  }
345  else
346  {
347  heap_buffer = (char *)std::malloc(actual_size);
348  vsprintf(heap_buffer, fmt, arg_ptr);
349  }
350 
351  std::string formatted_string(heap_buffer);
352  std::free(heap_buffer);
353  return formatted_string;
354  }
355  else
356  {
357  char stack_buffer[BUF_SIZE];
358  vsprintf(stack_buffer, fmt, arg_ptr);
359  return std::string(stack_buffer);
360  }
361 }
362 
363 #elif defined(_MSC_VER) && (_MSC_VER < 1300)
364 
365 std::string
366 vformat(char const * fmt, va_list arg_ptr)
367 {
368  alwaysAssertM(fmt, "vformat: Format string cannot be a null pointer");
369 
370  // We draw the line at a 1MB string.
371  static intx const MAX_SIZE = 1000000;
372 
373  // If the string is less than 161 characters,
374  // allocate it on the stack because this saves
375  // the malloc/free time.
376  static intx const BUF_SIZE = 161;
377  char stack_buffer[BUF_SIZE];
378 
379  // MSVC6 doesn't support va_copy, however it also seems to compile
380  // correctly if we just pass our argument list along. Note that
381  // this whole code block is only compiled if we're on MSVC6 anyway
382  intx actual_written = _vsnprintf(stack_buffer, BUF_SIZE, fmt, arg_ptr);
383 
384  // Not a big enough buffer, BUF_SIZE characters written
385  if (actual_written == -1)
386  {
387  intx heap_size = 512;
388  double pow_size = 1.0;
389  char * heap_buffer = (char *)std::malloc(heap_size);
390 
391  while ((_vsnprintf(heap_buffer, heap_size, fmt, arg_ptr) == -1) && (heap_size < MAX_SIZE))
392  {
393  heap_size = std::ceil(heap_size * std::pow(2.0, pow_size++));
394  heap_buffer = (char *)std::realloc(heap_buffer, heap_size);
395  }
396 
397  heap_buffer[heap_size - 1] = '\0';
398 
399  std::string heap_string(heap_buffer);
400  std::free(heap_buffer);
401 
402  return heap_string;
403  }
404  else
405  {
406  return std::string(stack_buffer);
407  }
408 }
409 
410 #else
411 
412 // glibc 2.1 has been updated to the C99 standard
413 std::string
414 vformat(char const * fmt, va_list arg_ptr)
415 {
416  alwaysAssertM(fmt, "vformat: Format string cannot be a null pointer");
417 
418  // If the string is less than 161 characters,
419  // allocate it on the stack because this saves
420  // the malloc/free time. The number 161 is chosen
421  // to support two lines of text on an 80 character
422  // console (plus the null terminator).
423  static intx const BUF_SIZE = 161;
424  char stack_buffer[BUF_SIZE];
425 
426  va_list arg_ptr_copy;
427  va_copy(arg_ptr_copy, arg_ptr);
428  intx num_chars = vsnprintf(stack_buffer, BUF_SIZE, fmt, arg_ptr_copy);
429  va_end(arg_ptr_copy);
430 
431  if (num_chars >= BUF_SIZE)
432  {
433  // We didn't allocate a big enough string.
434  char * heap_buffer = (char *)std::malloc(num_chars + 1);
435 
436  debugAssertM(heap_buffer, "vformat: Heap buffer allocation failed");
437  intx num_chars2 = vsnprintf(heap_buffer, num_chars + 1, fmt, arg_ptr);
438  debugAssertM(num_chars2 == num_chars, "vformat: Number of characters written does not match expected value");
439  (void)num_chars2; // avoid unused variable warnings
440 
441  std::string result(heap_buffer);
442  std::free(heap_buffer);
443 
444  return result;
445  }
446  else
447  {
448  return std::string(stack_buffer);
449  }
450 }
451 
452 #endif
453 
454 #define THEA_EOS '\0'
455 #define THEA_RANGE_MATCH 1
456 #define THEA_RANGE_NOMATCH 0
457 #define THEA_RANGE_ERROR (-1)
458 
459 namespace StringAlgInternal {
460 
461 static int
462 rangeMatch(char const * pattern, char test, int flags, char ** newp)
463 {
464  int negate, ok;
465  char c, c2;
466 
467  /*
468  * A bracket expression starting with an unquoted circumflex
469  * character produces unspecified results (IEEE 1003.2-1992,
470  * 3.13.2). This implementation treats it like '!', for
471  * consistency with the regular expression syntax.
472  * J.T. Conklin (conklin@ngai.kaleida.com)
473  */
474  if ((negate = (*pattern == '!' || *pattern == '^')))
475  ++pattern;
476 
477  if (flags & FNM_CASEFOLD)
478  test = ::tolower((unsigned char)test);
479 
480  /*
481  * A right bracket shall lose its special meaning and represent
482  * itself in a bracket expression if it occurs first in the list.
483  * -- POSIX.2 2.8.3.2
484  */
485  ok = 0;
486  c = *pattern++;
487 
488  do
489  {
490  if (c == '\\' && !(flags & FNM_NOESCAPE))
491  c = *pattern++;
492 
493  if (c == THEA_EOS)
494  return (THEA_RANGE_ERROR);
495 
496  if (c == '/' && (flags & FNM_PATHNAME))
497  return (THEA_RANGE_NOMATCH);
498 
499  if ((flags & FNM_CASEFOLD))
500  c = ::tolower((unsigned char)c);
501 
502  if (*pattern == '-'
503  && (c2 = *(pattern + 1)) != THEA_EOS && c2 != ']')
504  {
505  pattern += 2;
506 
507  if (c2 == '\\' && !(flags & FNM_NOESCAPE))
508  c2 = *pattern++;
509 
510  if (c2 == THEA_EOS)
511  return (THEA_RANGE_ERROR);
512 
513  if (flags & FNM_CASEFOLD)
514  c2 = ::tolower((unsigned char)c2);
515 
516  if (c <= test && test <= c2)
517  ok = 1;
518  }
519  else if (c == test)
520  ok = 1;
521  }
522  while ((c = *pattern++) != ']');
523 
524  *newp = (char *)pattern;
525  return (ok == negate ? THEA_RANGE_NOMATCH : THEA_RANGE_MATCH);
526 }
527 
528 int
529 fnmatch(char const * pattern, char const * query, int flags)
530 {
531  alwaysAssertM(pattern && query, "fnmatch: Pattern and query strings cannot be null pointers");
532 
533  char const * stringstart;
534  char * newp;
535  char c, test;
536 
537  for (stringstart = query;;)
538  {
539  switch (c = *pattern++)
540  {
541  case THEA_EOS:
542  if ((flags & FNM_LEADING_DIR) && *query == '/')
543  return (0);
544 
545  return (*query == THEA_EOS ? 0 : FNM_NOMATCH);
546 
547  case '?':
548  if (*query == THEA_EOS)
549  return (FNM_NOMATCH);
550 
551  if (*query == '/' && (flags & FNM_PATHNAME))
552  return (FNM_NOMATCH);
553 
554  if (*query == '.' && (flags & FNM_PERIOD) &&
555  (query == stringstart ||
556  ((flags & FNM_PATHNAME) && *(query - 1) == '/')))
557  return (FNM_NOMATCH);
558 
559  ++query;
560  break;
561 
562  case '*':
563  c = *pattern;
564 
565  /* Collapse multiple stars. */
566  while (c == '*')
567  c = *++pattern;
568 
569  if (*query == '.' && (flags & FNM_PERIOD) &&
570  (query == stringstart ||
571  ((flags & FNM_PATHNAME) && *(query - 1) == '/')))
572  return (FNM_NOMATCH);
573 
574  /* Optimize for pattern with * at end or before /. */
575  if (c == THEA_EOS)
576  {
577  if (flags & FNM_PATHNAME)
578  return ((flags & FNM_LEADING_DIR) ||
579  std::strchr(query, '/') == nullptr ?
580  0 : FNM_NOMATCH);
581  else
582  return (0);
583  }
584  else if (c == '/' && (flags & FNM_PATHNAME))
585  {
586  if ((query = std::strchr(query, '/')) == nullptr)
587  return (FNM_NOMATCH);
588 
589  break;
590  }
591 
592  /* General case, use recursion. */
593  while ((test = *query) != THEA_EOS)
594  {
595  if (fnmatch(pattern, query, flags & ~FNM_PERIOD) == 0)
596  return (0);
597 
598  if (test == '/' && (flags & FNM_PATHNAME))
599  break;
600 
601  ++query;
602  }
603 
604  return (FNM_NOMATCH);
605 
606  case '[':
607  if (*query == THEA_EOS)
608  return (FNM_NOMATCH);
609 
610  if (*query == '/' && (flags & FNM_PATHNAME))
611  return (FNM_NOMATCH);
612 
613  if (*query == '.' && (flags & FNM_PERIOD) &&
614  (query == stringstart ||
615  ((flags & FNM_PATHNAME) && *(query - 1) == '/')))
616  return (FNM_NOMATCH);
617 
618  switch (rangeMatch(pattern, *query, flags, &newp))
619  {
620  case THEA_RANGE_ERROR:
621  /* not a good range, treat as normal text */
622  goto fnmatch_normal;
623 
624  case THEA_RANGE_MATCH:
625  pattern = newp;
626  break;
627 
628  case THEA_RANGE_NOMATCH:
629  return (FNM_NOMATCH);
630  }
631 
632  ++query;
633  break;
634 
635  case '\\':
636  if (!(flags & FNM_NOESCAPE))
637  {
638  if ((c = *pattern++) == THEA_EOS)
639  {
640  c = '\\';
641  --pattern;
642  }
643  }
644 
645  /* FALLTHROUGH */
646  default:
647 fnmatch_normal:
648  if (c != *query && !((flags & FNM_CASEFOLD) &&
649  (::tolower((unsigned char)c) ==
650  ::tolower((unsigned char)*query))))
651  return (FNM_NOMATCH);
652 
653  ++query;
654  break;
655  }
656  }
657 
658  /* NOTREACHED */
659  return -1;
660 }
661 
662 } // namespace StringAlgInternal
663 
664 bool
665 patternMatch(std::string const & pattern, std::string const & query, int flags)
666 {
667  int status = StringAlgInternal::fnmatch(pattern.c_str(), query.c_str(), flags);
668 
669  switch (status)
670  {
671  case 0: return true;
672  case FNM_NOMATCH: return false;
673  default: throw Error("patternMatch: Invalid pattern");
674  }
675 }
676 
677 } // namespace Thea
678 
679 #ifdef _MSC_VER
680 # pragma warning (pop)
681 #endif
bool patternMatch(std::string const &pattern, std::string const &query, int flags)
Compares a filename or pathname to a pattern.
Definition: StringAlg.cpp:665
std::ptrdiff_t intx
A signed integer suitable for indexing a structure held in memory.
Definition: Platform.hpp:161
bool beginsWith(std::string const &test, std::string const &pattern)
Check if the test string begins with the pattern string.
Definition: StringAlg.cpp:122
Root namespace for the Thea library.
void parseCommaSeparated(std::string const &s, Array< std::string > &array, bool strip_quotes)
Separates a comma-separated line, properly escaping commas within double quotes (") and super quotes ...
Definition: StringAlg.cpp:59
bool isWhitespace(char c)
Check if a character is a whitespace character.
bool endsWith(std::string const &test, std::string const &pattern)
Check if the test string ends with the pattern string.
Definition: StringAlg.cpp:143
std::string format(char const *fmt,...)
Produces a string from arguments in the style of printf.
Definition: StringAlg.cpp:305
std::string vformat(char const *fmt, va_list arg_ptr)
Produces a string from arguments in the style of printf, can be called with the argument list from a ...
Definition: StringAlg.cpp:414
std::string wordWrap(std::string const &input, intx num_cols, char const *newline=NEWLINE)
Produces a new string that is the input string wrapped at a certain number of columns (where the line...
Definition: StringAlg.cpp:167
std::string trimWhitespace(std::string const &s)
Strips whitespace from both ends of the string.
Definition: StringAlg.cpp:280
std::string toLower(std::string const &s)
Get the lowercase version of a string.
Definition: StringAlg.cpp:238
char const * NEWLINE
The newline character sequence for the current platform.
Definition: StringAlg.cpp:54
intx stringSplit(std::string const &s, char split_char, Array< std::string > &result, bool skip_empty_fields)
Split a string at each occurrence of a splitting character.
Definition: StringAlg.cpp:246
std::runtime_error Error
An error class.
Definition: Error.hpp:27
std::vector< T, Alloc > Array
Dynamically resizable array.
Definition: Array.hpp:25
std::string toUpper(std::string const &s)
Get the uppercase version of a string.
Definition: StringAlg.cpp:230
void alwaysAssertM(CondT const &test, MessageT const &msg)
Check if a test condition is true, and immediately abort the program with an error code if not...
Definition: Common.hpp:66
void debugAssertM(CondT const &test, MessageT const &msg)
Check if a test condition is true, and immediately abort the program with an error code if not...
Definition: Common.hpp:52