path: root/src/StringUtils.cpp



// StringUtils.cpp

// Implements the various string helper functions:

#include "Globals.h"

#if defined(ANDROID_NDK)
#include <ctype.h>
#endif

#ifdef _MSC_VER
	// Under MSVC, link to WinSock2 (needed by RawBEToUTF8's byteswapping)
	#pragma comment(lib, "ws2_32.lib")
#endif


AString & AppendVPrintf(AString & str, const char *format, va_list args)
{
	ASSERT(format != NULL);
	
	char buffer[2048];
	size_t len;
	#ifdef _MSC_VER
	// MS CRT provides secure printf that doesn't behave like in the C99 standard
	if ((len = _vsnprintf_s(buffer, ARRAYCOUNT(buffer), _TRUNCATE, format, args)) != -1)
	#else  // _MSC_VER
	if ((len = vsnprintf(buffer, ARRAYCOUNT(buffer), format, args)) < ARRAYCOUNT(buffer))
	#endif  // else _MSC_VER
	{
		// The result did fit into the static buffer
		str.append(buffer, len);
		return str;
	}
	
	// The result did not fit into the static buffer
	#ifdef _MSC_VER
	// for MS CRT, we need to calculate the result length
	len = _vscprintf(format, args);
	if (len == -1)
	{
		return str;
	}
	#endif  // _MSC_VER
	
	// Allocate a buffer and printf into it:
	str.resize(len + 1);
	// HACK: we're accessing AString's internal buffer in a way that is NOT guaranteed to always work. But it works on all STL implementations tested.
	// I can't think of any other way that is safe, doesn't allocate twice as much space as needed and doesn't use C++11 features like the move constructor
	#ifdef _MSC_VER
	vsprintf_s((char *)str.data(), len + 1, format, args);
	#else  // _MSC_VER
	vsnprintf((char *)str.data(), len + 1, format, args);
	#endif  // else _MSC_VER
	str.resize(len);
	return str;
}


AString & Printf(AString & str, const char * format, ...)
{
	str.clear();
	va_list args;
	va_start(args, format);
	std::string &retval = AppendVPrintf(str, format, args);
	va_end(args);
	return retval;
}


AString Printf(const char * format, ...)
{
	AString res;
	va_list args;
	va_start(args, format);
	AppendVPrintf(res, format, args);
	va_end(args);
	return res;
}


AString & AppendPrintf(AString &str, const char *format, ...)
{
	va_list args;
	va_start(args, format);
	std::string &retval = AppendVPrintf(str, format, args);
	va_end(args);
	return retval;
}


AStringVector StringSplit(const AString & str, const AString & delim)
{
	AStringVector results;
	size_t cutAt = 0;
	size_t Prev = 0;
	while ((cutAt = str.find_first_of(delim, Prev)) != str.npos)
	{
		results.push_back(str.substr(Prev, cutAt - Prev));
		Prev = cutAt + 1;
	}
	if (Prev < str.length())
	{
		results.push_back(str.substr(Prev));
	}
	return results;
}


AStringVector StringSplitAndTrim(const AString & str, const AString & delim)
{
	AStringVector results;
	size_t cutAt = 0;
	size_t Prev = 0;
	while ((cutAt = str.find_first_of(delim, Prev)) != str.npos)
	{
		results.push_back(TrimString(str.substr(Prev, cutAt - Prev)));
		Prev = cutAt + 1;
	}
	if (Prev < str.length())
	{
		results.push_back(TrimString(str.substr(Prev)));
	}
	return results;
}


AString TrimString(const AString & str)
{
	size_t len = str.length();
	size_t start = 0;
	while (start < len)
	{
		if (str[start] > 32)
		{
			break;
		}
		++start;
	}
	if (start == len)
	{
		return "";
	}
	
	size_t end = len;
	while (end >= start)
	{
		if (str[end] > 32)
		{
			break;
		}
		--end;
	}
	
	return str.substr(start, end - start + 1);
}


AString & StrToUpper(AString & s)
{
	AString::iterator i = s.begin();
	AString::iterator end = s.end();

	while (i != end)
	{
		*i = (char)toupper(*i);
		++i;
	}
	return s;
}


AString & StrToLower(AString & s)
{
	AString::iterator i = s.begin();
	AString::iterator end = s.end();

	while (i != end)
	{
		*i = (char)tolower(*i);
		++i;
	}
	return s;
}


int NoCaseCompare(const AString & s1, const AString & s2)
{
	#ifdef _MSC_VER
	// MSVC has stricmp that compares case-insensitive:
	return _stricmp(s1.c_str(), s2.c_str());
	#else 
	// Do it the hard way:
	AString s1Copy(s1);
	AString s2Copy(s2);
	return StrToUpper(s1Copy).compare(StrToUpper(s2Copy));
	#endif  // else _MSC_VER
}


unsigned int RateCompareString(const AString & s1, const AString & s2 )
{
	unsigned int MatchedLetters = 0;
	unsigned int s1Length = s1.length();

	if( s1Length > s2.length() ) return 0; // Definitely not a match

	for (unsigned int i = 0; i < s1Length; i++)
	{
		char c1 = (char)toupper( s1[i] );
		char c2 = (char)toupper( s2[i] );
		if( c1 == c2 )
		{
			++MatchedLetters;
		}
		else
		{
			break;
		}
	}
	return MatchedLetters;
}


void ReplaceString(AString & iHayStack, const AString & iNeedle, const AString & iReplaceWith)
{
	size_t pos1 = iHayStack.find(iNeedle);
	while (pos1 != AString::npos)
	{
		iHayStack.replace( pos1, iNeedle.size(), iReplaceWith);
		pos1 = iHayStack.find(iNeedle, pos1);
	}
}


// Converts a stream of BE shorts into UTF-8 string; returns a ref to a_UTF8
AString & RawBEToUTF8(short * a_RawData, int a_NumShorts, AString & a_UTF8)
{
	a_UTF8.clear();
	a_UTF8.reserve(3 * a_NumShorts / 2);  // a quick guess of the resulting size
	for (int i = 0; i < a_NumShorts; i++)
	{
		int c = ntohs(*(a_RawData + i));
		if (c < 0x80)
		{
			a_UTF8.push_back((char)c);
		}
		else if (c < 0x800)
		{
			a_UTF8.push_back((char)(192 + c / 64));
			a_UTF8.push_back((char)(128 + c % 64));
		}
		else if (c - 0xd800u < 0x800)
		{
			// Error, silently drop
		}
		else if (c < 0x10000)
		{
			a_UTF8.push_back((char)(224 + c / 4096));
			a_UTF8.push_back((char)(128 + c / 64 % 64));
			a_UTF8.push_back((char)(128 + c % 64));
		}
		else if (c < 0x110000)
		{
			a_UTF8.push_back((char)(240 + c / 262144));
			a_UTF8.push_back((char)(128 + c / 4096 % 64));
			a_UTF8.push_back((char)(128 + c / 64 % 64));
			a_UTF8.push_back((char)(128 + c % 64));
		}
		else
		{
			// Error, silently drop
		}
	}
	return a_UTF8;
}


// UTF-8 conversion code adapted from:
//  http://stackoverflow.com/questions/2867123/convert-utf-16-to-utf-8-under-windows-and-linux-in-c

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Begin of Unicode, Inc.'s code / information
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

/*
Notice from the original file:
* Copyright 2001-2004 Unicode, Inc.
* 
* Disclaimer
* 
* This source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
* 
* Limitations on Rights to Redistribute This Code
* 
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*/

#define UNI_MAX_BMP         0x0000FFFF
#define UNI_MAX_UTF16       0x0010FFFF
#define UNI_MAX_UTF32       0x7FFFFFFF
#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
#define UNI_SUR_HIGH_START  0xD800
#define UNI_SUR_HIGH_END    0xDBFF
#define UNI_SUR_LOW_START   0xDC00
#define UNI_SUR_LOW_END     0xDFFF


static const char trailingBytesForUTF8[256] =
{
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};


static const unsigned int offsetsFromUTF8[6] =
{
	0x00000000UL, 0x00003080UL, 0x000E2080UL, 
	0x03C82080UL, 0xFA082080UL, 0x82082080UL
};


static bool isLegalUTF8(const unsigned char * source, int length)
{
	unsigned char a;
	const unsigned char * srcptr = source + length;
	switch (length)
	{
		default: return false;
		// Everything else falls through when "true"...
		case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
		case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
		case 2:
		{
			if ((a = (*--srcptr)) > 0xBF) return false;
			switch (*source)
			{
				// no fall-through in this inner switch
				case 0xE0: if (a < 0xA0) return false; break;
				case 0xED: if (a > 0x9F) return false; break;
				case 0xF0: if (a < 0x90) return false; break;
				case 0xF4: if (a > 0x8F) return false; break;
				default:   if (a < 0x80) return false;
			}
		}
		case 1: if (*source >= 0x80 && *source < 0xC2) return false;
	}
	if (*source > 0xF4) return false;
	return true;
}


AString & UTF8ToRawBEUTF16(const char * a_UTF8, size_t a_UTF8Length, AString & a_UTF16)
{
	a_UTF16.clear();
	a_UTF16.reserve(a_UTF8Length * 3);

	const unsigned char * source    = (const unsigned char*)a_UTF8;
	const unsigned char * sourceEnd = source + a_UTF8Length;
	const int halfShift  = 10;  // used for shifting by 10 bits
	const unsigned int halfBase = 0x0010000UL;
	const unsigned int halfMask = 0x3FFUL;

	while (source < sourceEnd)
	{
		unsigned int ch = 0;
		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
		if (source + extraBytesToRead >= sourceEnd)
		{
			return a_UTF16;
		}
		// Do this check whether lenient or strict
		if (!isLegalUTF8(source, extraBytesToRead + 1))
		{
			return a_UTF16;
			break;
		}
		
		// The cases all fall through. See "Note A" below.
		switch (extraBytesToRead)
		{
			case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
			case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
			case 3: ch += *source++; ch <<= 6;
			case 2: ch += *source++; ch <<= 6;
			case 1: ch += *source++; ch <<= 6;
			case 0: ch += *source++;
		}
		ch -= offsetsFromUTF8[extraBytesToRead];

		if (ch <= UNI_MAX_BMP)
		{
			// Target is a character <= 0xFFFF
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
			{
				// UTF-16 surrogate values are illegal in UTF-32
				ch = ' ';
			}
			unsigned short v = htons((unsigned short)ch);
			a_UTF16.append((const char *)&v, 2);
		}
		else if (ch > UNI_MAX_UTF16)
		{
			// Invalid value, replace with a space
			unsigned short v = htons(' ');
			a_UTF16.append((const char *)&v, 2);
		}
		else
		{
			// target is a character in range 0xFFFF - 0x10FFFF.
			ch -= halfBase;
			unsigned short v1 = htons((ch >> halfShift) + UNI_SUR_HIGH_START);
			unsigned short v2 = htons((ch & halfMask) + UNI_SUR_LOW_START);
			a_UTF16.append((const char *)&v1, 2);
			a_UTF16.append((const char *)&v2, 2);
		}
	}
	return a_UTF16;
}

/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
    {
        int tmpBytesToRead = extraBytesToRead+1;
        do {
        ch += *source++;
        --tmpBytesToRead;
        if (tmpBytesToRead) ch <<= 6;
        } while (tmpBytesToRead > 0);
    }

   ---------------------------------------------------------------------
*/

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// End of Unicode, Inc.'s code / information
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


#define HEX(x) ((x) > 9 ? (x) + 'A' - 10 : (x) + '0')

/**
format binary data this way:
00001234: 31 32 33 34 35 36 37 38 39 30 61 62 63 64 65 66    1234567890abcdef
*/
AString & CreateHexDump(AString & a_Out, const void * a_Data, int a_Size, int a_LineLength)
{
	ASSERT(a_LineLength <= 120);  // Due to using a fixed size line buffer; increase line[]'s size to lift this max
	char line[512];
	char * p;
	char * q;
	
	a_Out.reserve(a_Size / a_LineLength * (18 + 6 * a_LineLength));
	for (int i = 0; i < a_Size; i += a_LineLength)
	{
		int k = a_Size - i;
		if (k > a_LineLength)
		{
			k = a_LineLength;
		}
		#ifdef _MSC_VER
		// MSVC provides a "secure" version of sprintf()
		int Count = sprintf_s(line, sizeof(line), "%08x:", i);
		#else
		int Count = sprintf(line, "%08x:", i);
		#endif
		// Remove the terminating NULL / leftover garbage in line, after the sprintf-ed value
		memset(line + Count, 32, sizeof(line) - Count);
		p = line + 10;
		q = p + 2 + a_LineLength * 3 + 1;
		for (int j = 0; j < k; j++)
		{
			unsigned char c = ((unsigned char *)a_Data)[i + j];
			p[0] = HEX(c >> 4);
			p[1] = HEX(c & 0xf);
			p[2] = ' ';
			if (c >= ' ')
			{
				q[0] = (char)c;
			}
			else
			{
				q[0] = '.';
			}
			p += 3;
			q ++;
		}  // for j
		q[0] = '\n';
		q[1] = 0;
		a_Out.append(line);
	}  // for i
	return a_Out;
}


AString EscapeString(const AString & a_Message)
{
	AString EscapedMsg;
	size_t len = a_Message.size();
	size_t last = 0;
	EscapedMsg.reserve(len);
	for (size_t i = 0; i < len; i++)
	{
		char ch = a_Message[i];
		switch (ch)
		{
			case '\'':
			case '\"':
			case '\\':
			{
				if (i > last)
				{
					EscapedMsg.append(a_Message, last, i - last);
				}
				EscapedMsg.push_back('\\');
				EscapedMsg.push_back(ch);
				last = i + 1;
				break;
			}
		}  // switch (ch)
	}  // for i - a_Message[]
	if (len > last)
	{
		EscapedMsg.append(a_Message, last, len - last);
	}
	return EscapedMsg;
}


AString StripColorCodes(const AString & a_Message)
{
	AString res(a_Message);
	size_t idx = 0;
	while (true)
	{
		idx = res.find("\xc2\xa7", idx);
		if (idx == AString::npos)
		{
			return res;
		}
		res.erase(idx, 3);
	}
}


AString URLDecode(const AString & a_String)
{
	AString res;
	size_t len = a_String.length();
	res.reserve(len);
	for (size_t i = 0; i < len; i++)
	{
		char ch = a_String[i];
		if ((ch != '%') || (i > len - 3))
		{
			res.push_back(ch);
			continue;
		}
		// Decode the hex value:
		char hi = a_String[i + 1], lo = a_String[i + 2];
		if ((hi >= '0') && (hi <= '9'))
		{
			hi = hi - '0';
		}
		else if ((hi >= 'a') && (hi <= 'f'))
		{
			hi = hi - 'a' + 10;
		}
		else if ((hi >= 'A') && (hi <= 'F'))
		{
			hi = hi - 'F' + 10;
		}
		else
		{
			res.push_back(ch);
			continue;
		}
		if ((lo >= '0') && (lo <= '9'))
		{
			lo = lo - '0';
		}
		else if ((lo >= 'a') && (lo <= 'f'))
		{
			lo = lo - 'a' + 10;
		}
		else if ((lo >= 'A') && (lo <= 'F'))
		{
			lo = lo - 'A' + 10;
		}
		else
		{
			res.push_back(ch);
			continue;
		}
		res.push_back((hi << 4) | lo);
		i += 2;
	}  // for i - a_String[]
	return res;
}


AString ReplaceAllCharOccurrences(const AString & a_String, char a_From, char a_To)
{
	AString res(a_String);
	std::replace(res.begin(), res.end(), a_From, a_To);
	return res;
}


/// Converts one Hex character in a Base64 encoding into the data value
static inline int UnBase64(char c)
{
	if (c >='A' && c <= 'Z')
	{
		return c - 'A';
	}
	if (c >='a' && c <= 'z')
	{
		return c - 'a' + 26;
	}
	if (c >= '0' && c <= '9')
	{
		return c - '0' + 52;
	}
	if (c == '+')
	{
		return 62;
	}
	if (c == '/')
	{
		return 63;
	}
	if (c == '=')
	{
		return -1;
	}
	return -2;
}


AString Base64Decode(const AString & a_Base64String)
{
	AString res;
	size_t i, len = a_Base64String.size();
	int o, c;
	res.resize((len * 4) / 3 + 5, 0);  // Approximate the upper bound on the result length
	for (o = 0, i = 0; i < len; i++)
	{
		c = UnBase64(a_Base64String[i]);
		if (c >= 0)
		{
			switch (o & 7)
			{
				case 0:	res[o >> 3] |= (c << 2); break;
				case 6: res[o >> 3] |= (c >> 4); res[(o >> 3) + 1] |= (c << 4); break;
				case 4: res[o >> 3] |= (c >> 2); res[(o >> 3) + 1] |= (c << 6); break;
				case 2: res[o >> 3] |= c; break;
			}
			o += 6;
		}
		if (c == -1)
		{
			// Error while decoding, invalid input. Return as much as we've decoded:
			res.resize(o >> 3);
			return res;
		}
	}
	res.resize(o >> 3);
	return res;}


short GetBEShort(const char * a_Mem)
{
	return (((short)a_Mem[0]) << 8) | a_Mem[1];
}


int GetBEInt(const char * a_Mem)
{
	return (((int)a_Mem[0]) << 24) | (((int)a_Mem[1]) << 16) | (((int)a_Mem[2]) << 8) | a_Mem[3];
}


void SetBEInt(char * a_Mem, Int32 a_Value)
{
	a_Mem[0] = a_Value >> 24;
	a_Mem[1] = (a_Value >> 16) & 0xff;
	a_Mem[2] = (a_Value >> 8) & 0xff;
	a_Mem[3] = a_Value & 0xff;
}