bstrlib-fixed/src/buniutil.c

/*
 * This source file is part of the bstring string library.  This code was
 * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
 * license and the GPL. Refer to the accompanying documentation for details
 * on usage and license.
 */

/*
 * buniutil.c
 *
 * This file is not necessarily part of the core bstring library itself, but
 * is just an implementation of basic utf8 processing for bstrlib.  Note that
 * this module is dependent upon bstrlib.c and utf8util.c
 */

#include "bstrlib.h"
#include "buniutil.h"

/* U+FFFD.  buGetBlkUTF16 substitutes this value when the caller's errCh is
   not a legal code point, and writes it when a surrogate pair would not fit
   in the remaining output space. */
#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL)

/*  int buIsUTF8Content (const_bstring bu)
 *
 *  Walk the contents of bu with a utf8Iterator and report whether every
 *  decoding step succeeded.
 *
 *  Returns 0 if bdata (bu) is NULL, or as soon as one call to
 *  utf8IteratorGetNextCodePoint yields a value that is zero or negative.
 *  The iterator is handed -1 as its error value, so a failed decode is caught
 *  by that test; note that a result of exactly 0 is rejected by the same
 *  comparison.  Returns 1 if the scan reaches the end without such a result
 *  (including the case where the loop body never runs).
 */
int buIsUTF8Content (const_bstring bu) {
struct utf8Iterator scan;

	if (NULL == bdata (bu)) return 0;

	utf8IteratorInit (&scan, bu->data, bu->slen);
	while (scan.next < scan.slen) {
		/* Anything that is not strictly positive ends the scan. */
		if (utf8IteratorGetNextCodePoint (&scan, -1) <= 0) return 0;
	}
	return 1;
}

/*  int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu,
 *                     int pos)
 *
 *  Decode the UTF8 string bu, starting at its pos-th code point, into UTF16
 *  words stored in ucs2.
 *
 *  ucs2   Output array; at most len 16-bit words are written.  Any words
 *         left over after decoding stops are set to 0.
 *  len    Capacity of ucs2 in words; must be greater than 0.
 *  errCh  Value handed to the iterator for unparsable input.  If it is not
 *         a legal code point, U+FFFD is used instead.
 *  bu     Source string.
 *  pos    Number of leading code points to skip; must not be negative.
 *
 *  Code points of 0x10000 and above are written as a surrogate pair.  If
 *  only a single word of space remains for such a code point, one U+FFFD is
 *  written in its place.
 *
 *  Returns the number of words written before the zero padding.  Returns
 *  BSTR_ERR for invalid arguments, and also when the most recent decode
 *  result is negative; since that result starts out as BSTR_ERR, a call in
 *  which no code point is decoded at all (nothing left after skipping pos)
 *  takes this path as well, provided BSTR_ERR is negative (it is defined
 *  outside this file).
 */
int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) {
struct utf8Iterator iter;
cpUcs4 cp;
int start, seen, written;

	if (!isLegalUnicodeCodePoint (errCh)) errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
	if (NULL == ucs2 || 0 >= len || NULL == bdata (bu) || 0 > pos) return BSTR_ERR;

	/* Find the byte offset of the pos-th code point: every byte that is not
	   a 10xxxxxx continuation byte counts as the start of one. */
	start = 0;
	seen = 0;
	while (start < bu->slen) {
		if (0x80 != (0xC0 & bu->data[start])) {
			if (seen >= pos) break;
			seen++;
		}
		start++;
	}

	utf8IteratorInit (&iter, bu->data + start, bu->slen - start);

	cp = BSTR_ERR;
	written = 0;
	while (0 < len && iter.next < iter.slen) {
		cp = utf8IteratorGetNextCodePoint (&iter, errCh);
		if (0 > cp) break;

		if (cp < 0x10000) {
			/* Fits in a single 16-bit word. */
			*ucs2++ = (cpUcs2) cp;
			len--;
			written++;
		} else if (len >= 2) {
			/* Split into a high/low surrogate pair. */
			long y = cp - 0x10000;
			ucs2[0] = (cpUcs2) (0xD800 | (y >> 10));
			ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF));
			ucs2 += 2;
			len -= 2;
			written += 2;
		} else {
			/* No room for a pair: emit one replacement word instead. */
			*ucs2++ = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
			len--;
			written++;
		}
	}

	/* Zero whatever is left of the output array. */
	for (; 0 < len; len--) {
		*ucs2++ = 0;
	}

	utf8IteratorUninit (&iter);
	return (0 > cp) ? BSTR_ERR : written;
}

/*

Unicode                   UTF-8
-------                   -----
U-00000000 - U-0000007F:  0xxxxxxx
U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

UTF-32: U-000000 - U-10FFFF

*/

/*  int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh)
 *
 *  Append the UTF8 encoding of an array of UCS4 code points to b.
 *
 *  b      Destination string; must be non-NULL with a non-negative length.
 *  bu     Array of len code points to encode.
 *  len    Number of entries in bu; must not be negative.
 *  errCh  Substitute for any entry of bu that isLegalUnicodeCodePoint
 *         rejects.  If errCh itself is rejected, no substitution is done
 *         and the first bad entry aborts the call.
 *
 *  Returns BSTR_OK on success.  Returns BSTR_ERR on invalid arguments, on an
 *  unsubstitutable bad code point, or when an append to b fails.  On failure
 *  b is rolled back to the length it had on entry, and, when its buffer is
 *  writable, the '\0' terminator at that length is restored (earlier
 *  successful appends in the same call will have overwritten it).
 */
int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh) {
int i, oldSlen;

	if (NULL == bu || NULL == b || 0 > len || 0 > (oldSlen = blengthe (b, -1))) return BSTR_ERR;
	/* ~0 is the internal marker for "no usable replacement character". */
	if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0;

	for (i=0; i < len; i++) {
		/* Four bytes suffice: the last branch below is the 4-byte form, and
		   the longer 5/6-byte forms were never compiled in.  (Presumably
		   isLegalUnicodeCodePoint caps values at U+10FFFF -- it is defined
		   outside this file.) */
		unsigned char c[4];
		int n;
		cpUcs4 v = bu[i];

		if (!isLegalUnicodeCodePoint (v)) {
			if (~0 == errCh) goto Rollback;
			v = errCh;
		}

		if (v < 0x80) {
			/* 0xxxxxxx */
			if (BSTR_OK != bconchar (b, (char) v)) goto Rollback;
			continue;
		}

		if (v < 0x800) {
			/* 110xxxxx 10xxxxxx */
			c[0] = (unsigned char) ( (v >>  6)         + 0xc0);
			c[1] = (unsigned char) ((        v & 0x3f) + 0x80);
			n = 2;
		} else if (v < 0x10000) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			c[0] = (unsigned char) ( (v >> 12)         + 0xe0);
			c[1] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
			c[2] = (unsigned char) ((        v & 0x3f) + 0x80);
			n = 3;
		} else {
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			c[0] = (unsigned char) ( (v >> 18)         + 0xf0);
			c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
			c[2] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
			c[3] = (unsigned char) ((        v & 0x3f) + 0x80);
			n = 4;
		}
		if (BSTR_OK != bcatblk (b, c, n)) goto Rollback;
	}
	return BSTR_OK;

Rollback:
	b->slen = oldSlen;
	/* Put the terminator back, but only when the buffer may be written:
	   a write-protected or static string (mlen <= 0) is never touched, and
	   mlen > oldSlen guarantees the slot lies inside the allocation. */
	if (NULL != b->data && b->mlen > oldSlen) b->data[oldSlen] = (unsigned char) '\0';
	return BSTR_ERR;
}

/* endSwap(cs,mode): when mode is non-zero, yields cs with its two low-order
   bytes exchanged (only the low 16 bits survive); when mode is zero, yields
   cs unchanged.  cs appears twice in the swapping branch, so it should not
   be an expression with side effects. */
#define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs))
/* Number of UCS4 code points buAppendBlkUTF16 gathers in its stack buffer
   before passing them on to buAppendBlkUcs4. */
#define TEMP_UCS4_BUFFER_SIZE (64)

/*  int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len,
 *                        cpUcs2* bom, cpUcs4 errCh)
 *
 *  Append the UTF8 encoding of an array of UTF16 words (utf16) to bu.
 *
 *  bu     Destination string; bdata (bu) must be non-NULL.
 *  utf16  Array of len 16-bit words to convert.
 *  len    Number of words in utf16; must not be negative.  With len == 0
 *         nothing is appended and BSTR_OK is returned.
 *  bom    Optional (may be NULL).  If *bom already holds a byte order mark
 *         (0xFEFF = native order, 0xFFFE = byte-swapped) that order is used.
 *         Otherwise the first word of utf16 is examined: if it is a byte
 *         order mark it selects the order, is dropped from the output, and
 *         is stored in *bom when bom is non-NULL.  If no mark is found,
 *         native order is assumed.
 *  errCh  Substitute for invalid input.  If errCh is not itself a legal
 *         code point, the first invalid input aborts the call.
 *
 *  A high surrogate (0xD800-0xDBFF) must be followed directly by a low
 *  surrogate (0xDC00-0xDFFF); the pair is combined into one code point.  A
 *  low surrogate on its own, a high surrogate in the last word, or a high
 *  surrogate followed by anything else is invalid: it is replaced by errCh
 *  and the word after it is NOT consumed, so it is converted on its own
 *  in the next iteration.
 *
 *  Returns BSTR_OK on success, otherwise BSTR_ERR.  On failure bu is rolled
 *  back to the length it had on entry and, when its buffer is writable, the
 *  '\0' terminator at that length is restored.
 */
int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) {
cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE];
int cc, i, sm, oldSlen;

	if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR;
	/* ~0 is the internal marker for "no usable replacement character". */
	if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0;
	if (len == 0) return BSTR_OK;

	oldSlen = bu->slen;
	i = 0;

	/* Check for BOM character and select endianness.  Also remove the
	   BOM from the stream, since there is no need for it in a UTF-8 encoding.
	   sm is only used as the on/off mode flag of endSwap. */
	if (bom && (cpUcs2) 0xFFFE == *bom) {
		sm = 8;
	} else if (bom && (cpUcs2) 0xFEFF == *bom) {
		sm = 0;
	} else if (utf16[i] == (cpUcs2) 0xFFFE) {
		if (bom) *bom = utf16[i];
		sm = 8;
		i++;
	} else if (utf16[i] == (cpUcs2) 0xFEFF) {
		if (bom) *bom = utf16[i];
		sm = 0;
		i++;
	} else {
		sm = 0; /* Assume local endianness. */
	}

	cc = 0;
	for (;i < len; i++) {
		cpUcs4 v = endSwap (utf16[i], sm);

		if ((v | 0x7FF) == 0xDFFF) { /* v is in the surrogate range D800..DFFF */
			cpUcs4 lo = 0;
			int paired = 0;

			/* Only a high surrogate may start a pair, and only if another
			   word exists: peek at it without reading past utf16[len-1]. */
			if (v < 0xDC00 && i + 1 < len) {
				lo = endSwap (utf16[i+1], sm);
				/* Explicit range test; subtracting 0xDC00 and comparing
				   against 0x3FF misses values below 0xDC00 when the type
				   is signed. */
				paired = (lo >= 0xDC00 && lo <= 0xDFFF);
			}

			if (paired) {
				i++; /* consume the low surrogate */
				v = ((v - 0xD800) << 10) + (lo - 0xDC00) + 0x10000;
			} else {
				/* Unpaired surrogate: abort or substitute, leaving the
				   following word to be handled by the next iteration. */
				if (~0 == errCh) goto ErrReturn;
				v = errCh;
			}
		}

		buff[cc] = v;
		cc++;
		/* Flush the staging buffer whenever it fills up. */
		if (cc >= TEMP_UCS4_BUFFER_SIZE) {
			if (0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;
			cc = 0;
		}
	}
	if (cc > 0 && 0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;

	return BSTR_OK;

ErrReturn:
	bu->slen = oldSlen;
	/* Put the terminator back, but only when the buffer may be written:
	   a write-protected or static string (mlen <= 0) is never touched, and
	   mlen > oldSlen guarantees the slot lies inside the allocation. */
	if (bu->mlen > oldSlen) bu->data[oldSlen] = (unsigned char) '\0';
	return BSTR_ERR;
}