summaryrefslogtreecommitdiffstats
path: root/src/StringUtils.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/StringUtils.cpp')
-rw-r--r--src/StringUtils.cpp766
1 files changed, 766 insertions, 0 deletions
diff --git a/src/StringUtils.cpp b/src/StringUtils.cpp
new file mode 100644
index 000000000..f7aeeed26
--- /dev/null
+++ b/src/StringUtils.cpp
@@ -0,0 +1,766 @@
+
+// StringUtils.cpp
+
+// Implements the various string helper functions:
+
+#include "Globals.h"
+
+#if defined(ANDROID_NDK)
+#include <ctype.h>
+#endif
+
+#ifdef _MSC_VER
+ // Under MSVC, link to WinSock2 (needed by RawBEToUTF8's byteswapping)
+ #pragma comment(lib, "ws2_32.lib")
+#endif
+
+
+
+
+
+AString & AppendVPrintf(AString & str, const char *format, va_list args)
+{
+ ASSERT(format != NULL);
+
+ char buffer[2048];
+ size_t len;
+ #ifdef _MSC_VER
+ // MS CRT provides secure printf that doesn't behave like in the C99 standard
+ if ((len = _vsnprintf_s(buffer, ARRAYCOUNT(buffer), _TRUNCATE, format, args)) != -1)
+ #else // _MSC_VER
+ if ((len = vsnprintf(buffer, ARRAYCOUNT(buffer), format, args)) < ARRAYCOUNT(buffer))
+ #endif // else _MSC_VER
+ {
+ // The result did fit into the static buffer
+ str.append(buffer, len);
+ return str;
+ }
+
+ // The result did not fit into the static buffer
+ #ifdef _MSC_VER
+ // for MS CRT, we need to calculate the result length
+ len = _vscprintf(format, args);
+ if (len == -1)
+ {
+ return str;
+ }
+ #endif // _MSC_VER
+
+ // Allocate a buffer and printf into it:
+ str.resize(len + 1);
+ // HACK: we're accessing AString's internal buffer in a way that is NOT guaranteed to always work. But it works on all STL implementations tested.
+ // I can't think of any other way that is safe, doesn't allocate twice as much space as needed and doesn't use C++11 features like the move constructor
+ #ifdef _MSC_VER
+ vsprintf_s((char *)str.data(), len + 1, format, args);
+ #else // _MSC_VER
+ vsnprintf((char *)str.data(), len + 1, format, args);
+ #endif // else _MSC_VER
+ str.resize(len);
+ return str;
+}
+
+
+
+
+
+AString & Printf(AString & str, const char * format, ...)
+{
+ str.clear();
+ va_list args;
+ va_start(args, format);
+ std::string &retval = AppendVPrintf(str, format, args);
+ va_end(args);
+ return retval;
+}
+
+
+
+
+
+AString Printf(const char * format, ...)
+{
+ AString res;
+ va_list args;
+ va_start(args, format);
+ AppendVPrintf(res, format, args);
+ va_end(args);
+ return res;
+}
+
+
+
+
+
+AString & AppendPrintf(AString &str, const char *format, ...)
+{
+ va_list args;
+ va_start(args, format);
+ std::string &retval = AppendVPrintf(str, format, args);
+ va_end(args);
+ return retval;
+}
+
+
+
+
+
+AStringVector StringSplit(const AString & str, const AString & delim)
+{
+ AStringVector results;
+ size_t cutAt = 0;
+ size_t Prev = 0;
+ while ((cutAt = str.find_first_of(delim, Prev)) != str.npos)
+ {
+ results.push_back(str.substr(Prev, cutAt - Prev));
+ Prev = cutAt + 1;
+ }
+ if (Prev < str.length())
+ {
+ results.push_back(str.substr(Prev));
+ }
+ return results;
+}
+
+
+
+
+
+AStringVector StringSplitAndTrim(const AString & str, const AString & delim)
+{
+ AStringVector results;
+ size_t cutAt = 0;
+ size_t Prev = 0;
+ while ((cutAt = str.find_first_of(delim, Prev)) != str.npos)
+ {
+ results.push_back(TrimString(str.substr(Prev, cutAt - Prev)));
+ Prev = cutAt + 1;
+ }
+ if (Prev < str.length())
+ {
+ results.push_back(TrimString(str.substr(Prev)));
+ }
+ return results;
+}
+
+
+
+
+AString TrimString(const AString & str)
+{
+ size_t len = str.length();
+ size_t start = 0;
+ while (start < len)
+ {
+ if (str[start] > 32)
+ {
+ break;
+ }
+ ++start;
+ }
+ if (start == len)
+ {
+ return "";
+ }
+
+ size_t end = len;
+ while (end >= start)
+ {
+ if (str[end] > 32)
+ {
+ break;
+ }
+ --end;
+ }
+
+ return str.substr(start, end - start + 1);
+}
+
+
+
+
+
+AString & StrToUpper(AString & s)
+{
+ AString::iterator i = s.begin();
+ AString::iterator end = s.end();
+
+ while (i != end)
+ {
+ *i = (char)toupper(*i);
+ ++i;
+ }
+ return s;
+}
+
+
+
+
+
+AString & StrToLower(AString & s)
+{
+ AString::iterator i = s.begin();
+ AString::iterator end = s.end();
+
+ while (i != end)
+ {
+ *i = (char)tolower(*i);
+ ++i;
+ }
+ return s;
+}
+
+
+
+
+
+int NoCaseCompare(const AString & s1, const AString & s2)
+{
+ #ifdef _MSC_VER
+ // MSVC has stricmp that compares case-insensitive:
+ return _stricmp(s1.c_str(), s2.c_str());
+ #else
+ // Do it the hard way:
+ AString s1Copy(s1);
+ AString s2Copy(s2);
+ return StrToUpper(s1Copy).compare(StrToUpper(s2Copy));
+ #endif // else _MSC_VER
+}
+
+
+
+
+
+unsigned int RateCompareString(const AString & s1, const AString & s2 )
+{
+ unsigned int MatchedLetters = 0;
+ unsigned int s1Length = s1.length();
+
+ if( s1Length > s2.length() ) return 0; // Definitely not a match
+
+ for (unsigned int i = 0; i < s1Length; i++)
+ {
+ char c1 = (char)toupper( s1[i] );
+ char c2 = (char)toupper( s2[i] );
+ if( c1 == c2 )
+ {
+ ++MatchedLetters;
+ }
+ else
+ {
+ break;
+ }
+ }
+ return MatchedLetters;
+}
+
+
+
+
+
+void ReplaceString(AString & iHayStack, const AString & iNeedle, const AString & iReplaceWith)
+{
+ size_t pos1 = iHayStack.find(iNeedle);
+ while (pos1 != AString::npos)
+ {
+ iHayStack.replace( pos1, iNeedle.size(), iReplaceWith);
+ pos1 = iHayStack.find(iNeedle, pos1);
+ }
+}
+
+
+
+
+// Converts a stream of BE shorts into UTF-8 string; returns a ref to a_UTF8
+AString & RawBEToUTF8(short * a_RawData, int a_NumShorts, AString & a_UTF8)
+{
+ a_UTF8.clear();
+ a_UTF8.reserve(3 * a_NumShorts / 2); // a quick guess of the resulting size
+ for (int i = 0; i < a_NumShorts; i++)
+ {
+ int c = ntohs(*(a_RawData + i));
+ if (c < 0x80)
+ {
+ a_UTF8.push_back((char)c);
+ }
+ else if (c < 0x800)
+ {
+ a_UTF8.push_back((char)(192 + c / 64));
+ a_UTF8.push_back((char)(128 + c % 64));
+ }
+ else if (c - 0xd800u < 0x800)
+ {
+ // Error, silently drop
+ }
+ else if (c < 0x10000)
+ {
+ a_UTF8.push_back((char)(224 + c / 4096));
+ a_UTF8.push_back((char)(128 + c / 64 % 64));
+ a_UTF8.push_back((char)(128 + c % 64));
+ }
+ else if (c < 0x110000)
+ {
+ a_UTF8.push_back((char)(240 + c / 262144));
+ a_UTF8.push_back((char)(128 + c / 4096 % 64));
+ a_UTF8.push_back((char)(128 + c / 64 % 64));
+ a_UTF8.push_back((char)(128 + c % 64));
+ }
+ else
+ {
+ // Error, silently drop
+ }
+ }
+ return a_UTF8;
+}
+
+
+
+
+// UTF-8 conversion code adapted from:
+// http://stackoverflow.com/questions/2867123/convert-utf-16-to-utf-8-under-windows-and-linux-in-c
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Begin of Unicode, Inc.'s code / information
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+Notice from the original file:
+* Copyright 2001-2004 Unicode, Inc.
+*
+* Disclaimer
+*
+* This source code is provided as is by Unicode, Inc. No claims are
+* made as to fitness for any particular purpose. No warranties of any
+* kind are expressed or implied. The recipient agrees to determine
+* applicability of information provided. If this file has been
+* purchased on magnetic or optical media from Unicode, Inc., the
+* sole remedy for any claim will be exchange of defective media
+* within 90 days of receipt.
+*
+* Limitations on Rights to Redistribute This Code
+*
+* Unicode, Inc. hereby grants the right to freely use the information
+* supplied in this file in the creation of products supporting the
+* Unicode Standard, and to make copies of this file in any form
+* for internal or external distribution as long as this notice
+* remains attached.
+*/
+
+#define UNI_MAX_BMP 0x0000FFFF
+#define UNI_MAX_UTF16 0x0010FFFF
+#define UNI_MAX_UTF32 0x7FFFFFFF
+#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
+#define UNI_SUR_HIGH_START 0xD800
+#define UNI_SUR_HIGH_END 0xDBFF
+#define UNI_SUR_LOW_START 0xDC00
+#define UNI_SUR_LOW_END 0xDFFF
+
+
+
+
+
+static const char trailingBytesForUTF8[256] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+};
+
+
+
+
+
+static const unsigned int offsetsFromUTF8[6] =
+{
+ 0x00000000UL, 0x00003080UL, 0x000E2080UL,
+ 0x03C82080UL, 0xFA082080UL, 0x82082080UL
+};
+
+
+
+
+
+static bool isLegalUTF8(const unsigned char * source, int length)
+{
+ unsigned char a;
+ const unsigned char * srcptr = source + length;
+ switch (length)
+ {
+ default: return false;
+ // Everything else falls through when "true"...
+ case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 2:
+ {
+ if ((a = (*--srcptr)) > 0xBF) return false;
+ switch (*source)
+ {
+ // no fall-through in this inner switch
+ case 0xE0: if (a < 0xA0) return false; break;
+ case 0xED: if (a > 0x9F) return false; break;
+ case 0xF0: if (a < 0x90) return false; break;
+ case 0xF4: if (a > 0x8F) return false; break;
+ default: if (a < 0x80) return false;
+ }
+ }
+ case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+ }
+ if (*source > 0xF4) return false;
+ return true;
+}
+
+
+
+
+
+AString & UTF8ToRawBEUTF16(const char * a_UTF8, size_t a_UTF8Length, AString & a_UTF16)
+{
+ a_UTF16.clear();
+ a_UTF16.reserve(a_UTF8Length * 3);
+
+ const unsigned char * source = (const unsigned char*)a_UTF8;
+ const unsigned char * sourceEnd = source + a_UTF8Length;
+ const int halfShift = 10; // used for shifting by 10 bits
+ const unsigned int halfBase = 0x0010000UL;
+ const unsigned int halfMask = 0x3FFUL;
+
+ while (source < sourceEnd)
+ {
+ unsigned int ch = 0;
+ unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+ if (source + extraBytesToRead >= sourceEnd)
+ {
+ return a_UTF16;
+ }
+ // Do this check whether lenient or strict
+ if (!isLegalUTF8(source, extraBytesToRead + 1))
+ {
+ return a_UTF16;
+ break;
+ }
+
+ // The cases all fall through. See "Note A" below.
+ switch (extraBytesToRead)
+ {
+ case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 3: ch += *source++; ch <<= 6;
+ case 2: ch += *source++; ch <<= 6;
+ case 1: ch += *source++; ch <<= 6;
+ case 0: ch += *source++;
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead];
+
+ if (ch <= UNI_MAX_BMP)
+ {
+ // Target is a character <= 0xFFFF
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
+ {
+ // UTF-16 surrogate values are illegal in UTF-32
+ ch = ' ';
+ }
+ unsigned short v = htons((unsigned short)ch);
+ a_UTF16.append((const char *)&v, 2);
+ }
+ else if (ch > UNI_MAX_UTF16)
+ {
+ // Invalid value, replace with a space
+ unsigned short v = htons(' ');
+ a_UTF16.append((const char *)&v, 2);
+ }
+ else
+ {
+ // target is a character in range 0xFFFF - 0x10FFFF.
+ ch -= halfBase;
+ unsigned short v1 = htons((ch >> halfShift) + UNI_SUR_HIGH_START);
+ unsigned short v2 = htons((ch & halfMask) + UNI_SUR_LOW_START);
+ a_UTF16.append((const char *)&v1, 2);
+ a_UTF16.append((const char *)&v2, 2);
+ }
+ }
+ return a_UTF16;
+}
+
+/* ---------------------------------------------------------------------
+
+ Note A.
+ The fall-through switches in UTF-8 reading code save a
+ temp variable, some decrements & conditionals. The switches
+ are equivalent to the following loop:
+ {
+ int tmpBytesToRead = extraBytesToRead+1;
+ do {
+ ch += *source++;
+ --tmpBytesToRead;
+ if (tmpBytesToRead) ch <<= 6;
+ } while (tmpBytesToRead > 0);
+ }
+
+ ---------------------------------------------------------------------
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// End of Unicode, Inc.'s code / information
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+#define HEX(x) ((x) > 9 ? (x) + 'A' - 10 : (x) + '0')
+
+/**
+format binary data this way:
+00001234: 31 32 33 34 35 36 37 38 39 30 61 62 63 64 65 66 1234567890abcdef
+*/
+AString & CreateHexDump(AString & a_Out, const void * a_Data, int a_Size, int a_LineLength)
+{
+ ASSERT(a_LineLength <= 120); // Due to using a fixed size line buffer; increase line[]'s size to lift this max
+ char line[512];
+ char * p;
+ char * q;
+
+ a_Out.reserve(a_Size / a_LineLength * (18 + 6 * a_LineLength));
+ for (int i = 0; i < a_Size; i += a_LineLength)
+ {
+ int k = a_Size - i;
+ if (k > a_LineLength)
+ {
+ k = a_LineLength;
+ }
+ #ifdef _MSC_VER
+ // MSVC provides a "secure" version of sprintf()
+ int Count = sprintf_s(line, sizeof(line), "%08x:", i);
+ #else
+ int Count = sprintf(line, "%08x:", i);
+ #endif
+ // Remove the terminating NULL / leftover garbage in line, after the sprintf-ed value
+ memset(line + Count, 32, sizeof(line) - Count);
+ p = line + 10;
+ q = p + 2 + a_LineLength * 3 + 1;
+ for (int j = 0; j < k; j++)
+ {
+ unsigned char c = ((unsigned char *)a_Data)[i + j];
+ p[0] = HEX(c >> 4);
+ p[1] = HEX(c & 0xf);
+ p[2] = ' ';
+ if (c >= ' ')
+ {
+ q[0] = (char)c;
+ }
+ else
+ {
+ q[0] = '.';
+ }
+ p += 3;
+ q ++;
+ } // for j
+ q[0] = '\n';
+ q[1] = 0;
+ a_Out.append(line);
+ } // for i
+ return a_Out;
+}
+
+
+
+
+
+AString EscapeString(const AString & a_Message)
+{
+ AString EscapedMsg;
+ size_t len = a_Message.size();
+ size_t last = 0;
+ EscapedMsg.reserve(len);
+ for (size_t i = 0; i < len; i++)
+ {
+ char ch = a_Message[i];
+ switch (ch)
+ {
+ case '\'':
+ case '\"':
+ case '\\':
+ {
+ if (i > last)
+ {
+ EscapedMsg.append(a_Message, last, i - last);
+ }
+ EscapedMsg.push_back('\\');
+ EscapedMsg.push_back(ch);
+ last = i + 1;
+ break;
+ }
+ } // switch (ch)
+ } // for i - a_Message[]
+ if (len > last)
+ {
+ EscapedMsg.append(a_Message, last, len - last);
+ }
+ return EscapedMsg;
+}
+
+
+
+
+
+AString StripColorCodes(const AString & a_Message)
+{
+ AString res(a_Message);
+ size_t idx = 0;
+ while (true)
+ {
+ idx = res.find("\xc2\xa7", idx);
+ if (idx == AString::npos)
+ {
+ return res;
+ }
+ res.erase(idx, 3);
+ }
+}
+
+
+
+
+
+AString URLDecode(const AString & a_String)
+{
+ AString res;
+ size_t len = a_String.length();
+ res.reserve(len);
+ for (size_t i = 0; i < len; i++)
+ {
+ char ch = a_String[i];
+ if ((ch != '%') || (i > len - 3))
+ {
+ res.push_back(ch);
+ continue;
+ }
+ // Decode the hex value:
+ char hi = a_String[i + 1], lo = a_String[i + 2];
+ if ((hi >= '0') && (hi <= '9'))
+ {
+ hi = hi - '0';
+ }
+ else if ((hi >= 'a') && (hi <= 'f'))
+ {
+ hi = hi - 'a' + 10;
+ }
+ else if ((hi >= 'A') && (hi <= 'F'))
+ {
+ hi = hi - 'F' + 10;
+ }
+ else
+ {
+ res.push_back(ch);
+ continue;
+ }
+ if ((lo >= '0') && (lo <= '9'))
+ {
+ lo = lo - '0';
+ }
+ else if ((lo >= 'a') && (lo <= 'f'))
+ {
+ lo = lo - 'a' + 10;
+ }
+ else if ((lo >= 'A') && (lo <= 'F'))
+ {
+ lo = lo - 'A' + 10;
+ }
+ else
+ {
+ res.push_back(ch);
+ continue;
+ }
+ res.push_back((hi << 4) | lo);
+ i += 2;
+ } // for i - a_String[]
+ return res;
+}
+
+
+
+
+
+AString ReplaceAllCharOccurrences(const AString & a_String, char a_From, char a_To)
+{
+ AString res(a_String);
+ std::replace(res.begin(), res.end(), a_From, a_To);
+ return res;
+}
+
+
+
+
+
+/// Converts one Hex character in a Base64 encoding into the data value
+static inline int UnBase64(char c)
+{
+ if (c >='A' && c <= 'Z')
+ {
+ return c - 'A';
+ }
+ if (c >='a' && c <= 'z')
+ {
+ return c - 'a' + 26;
+ }
+ if (c >= '0' && c <= '9')
+ {
+ return c - '0' + 52;
+ }
+ if (c == '+')
+ {
+ return 62;
+ }
+ if (c == '/')
+ {
+ return 63;
+ }
+ if (c == '=')
+ {
+ return -1;
+ }
+ return -2;
+}
+
+
+
+
+
+AString Base64Decode(const AString & a_Base64String)
+{
+ AString res;
+ size_t i, len = a_Base64String.size();
+ int o, c;
+ res.resize((len * 4) / 3 + 5, 0); // Approximate the upper bound on the result length
+ for (o = 0, i = 0; i < len; i++)
+ {
+ c = UnBase64(a_Base64String[i]);
+ if (c >= 0)
+ {
+ switch (o & 7)
+ {
+ case 0: res[o >> 3] |= (c << 2); break;
+ case 6: res[o >> 3] |= (c >> 4); res[(o >> 3) + 1] |= (c << 4); break;
+ case 4: res[o >> 3] |= (c >> 2); res[(o >> 3) + 1] |= (c << 6); break;
+ case 2: res[o >> 3] |= c; break;
+ }
+ o += 6;
+ }
+ if (c == -1)
+ {
+ // Error while decoding, invalid input. Return as much as we've decoded:
+ res.resize(o >> 3);
+ return res;
+ }
+ }
+ res.resize(o >> 3);
+ return res;}
+
+
+
+