/* This code implements Unicode SCSU encoding. It is a plug-in module to a larger transcoder system - anyone wanting to use it would have to write some surrounding support code. The heuristics used may be of general interest though. This document is dedicated to the public domain by its author, Kevin Bracey. As such, there is ABSOLUTELY NO WARRANTY, INCLUDING ANY IMPLIED WARRANTIES OF FITNESS OR MERCHANTABILITY. */ #include #include #include "iso10646.h" #include "encpriv.h" #include "enc_scsu.h" /* #define DEBUG_SCSU_COMPRESSION #define DEBUG_LEARNING */ #ifdef DEBUG_SCSU_COMPRESSION #include #endif /* * Routines for encoding SCSU (Standard Compression Scheme for Unicode) * Number: ??? * Names: x-SCSU */ /* Single Mode Tag Values */ #define SQ0 0x01 #define SQ1 0x02 #define SQ2 0x03 #define SQ3 0x04 #define SQ4 0x05 #define SQ5 0x06 #define SQ6 0x07 #define SQ7 0x08 #define SDX 0x0B #define SQU 0x0E #define SCU 0x0F #define SC0 0x10 #define SC1 0x11 #define SC2 0x12 #define SC3 0x13 #define SC4 0x14 #define SC5 0x15 #define SC6 0x16 #define SC7 0x17 #define SD0 0x18 #define SD1 0x19 #define SD2 0x1A #define SD3 0x1B #define SD4 0x1C #define SD5 0x1D #define SD6 0x1E #define SD7 0x1F #define IS_SINGLE_LITERAL(c) ((c)==0 || (c)==0x09 || (c)==0x0A || \ (c)==0x0D || ((c)>=0x20 && (c)<=0x7F)) /* Unicode Mode Tag Values */ #define UC0 0xE0 #define UC1 0xE1 #define UC2 0xE2 #define UC3 0xE3 #define UC4 0xE4 #define UC5 0xE5 #define UC6 0xE6 #define UC7 0xE7 #define UD0 0xE8 #define UD1 0xE9 #define UD2 0xEA #define UD3 0xEB #define UD4 0xEC #define UD5 0xED #define UD6 0xEE #define UD7 0xEF #define UQU 0xF0 #define UDX 0xF1 #define IS_UNICODE_LITERAL(c) ((c)<0xE0 || (c)>=0xF3) static const UCS2 static_window[8] = { 0x0000, /* C0 controls */ 0x0080, /* Latin-1 Supplement */ 0x0100, /* Latin Extended-A */ 0x0300, /* Combining Diacritical Marks */ 0x2000, /* General Punctuation */ 0x2080, /* Currency Symbols */ 0x2100, /* Letterlike Symbols and Number Forms */ 0x3000 /* CJK Symbols & Punctuation */ }; static const unsigned char default_dynamic_window[8] = { 0x01, /* 0080: Latin-1 */ 0xF9, /* 00C0: Latin-1 letters + half of Extended-A */ 0x08, /* 0400: Cyrillic */ 0x0C, /* 0600: Arabic */ 0x12, /* 0900: Devanagari */ 0xFD, /* 3040: Hiragana */ 0xFE, /* 30A0: Katakana */ 0xA6 /* FF00: Fullwidth ASCII */ }; /* * A guess at which default windows we're most likely to use. * From the most likely first: Latin-1 Supplement, Latin-1/Latin Ext-A, Cyrillic, Hiragana, * Katakana, Fullwidth ASCII, Arabic, Devanagari. */ static const char default_window_usage[8] = { 7, 6, 5, 1, 0, 4, 3, 2 }; /* * The special window offsets. */ static const UCS2 special_offset[7] = { 0x00C0, /* Latin-1 letters + half of Extended-A */ 0x0250, /* IPA Extensions */ 0x0370, /* Greek */ 0x0530, /* Armenian */ 0x3040, /* Hiragana */ 0x30A0, /* Katakana */ 0xFF60 /* Halfwidth Katakana */ }; #define SPECIAL_OFFSET_BASE 0xF9 static UCS4 window_offset(unsigned char x) { if (x >= 0x01 && x <= 0x67) return x * 0x80; else if (x >= 0x68 && x <= 0xA7) return x * 0x80 + 0xAC00; else if (x >= SPECIAL_OFFSET_BASE) return special_offset[x - SPECIAL_OFFSET_BASE]; else return 0xF0000; /* For lack of anything better to do */ } static int default_lock_to(unsigned char x) { /* * 256 bits, giving the default state for the lock_to flag for each possible * window position. This is set to 1 for the following windows: * 0080-0200, 0380-1100, 1200-1D80, 2800-2880, 3080-3180, E000-F880, * 00C0, 0250, 0370, 0530, 3040, 30A0, FF60, plus the * "reserved for future use" windows. */ static const unsigned long bits[8] = { 0xFFFFFF9E, /* 01111001111111111111111111111111 0080-0200, 0380-0F80 */ 0x0FFFFFF7, /* 11101111111111111111111111110000 1000-1100, 1200-1D80 */ 0x00030000, /* 00000000000000001100000000000000 2800-2880 */ 0xFFFFFF0E, /* 01110000111111111111111111111111 3080-3180, E000-EB80 */ 0x03FFFFFF, /* 11111111111111111111111111000000 EC00-F880 */ 0xFFFFFF00, /* 00000000111111111111111111111111 reserved (A8-BF) */ 0xFFFFFFFF, /* 11111111111111111111111111111111 reserved (C0-DF) */ 0xFFFFFFFF /* 11111111111111111111111111111111 reserved (E0-F8) + specials */ }; return (int)(bits[x >> 5] >> (x & 0x1F)) & 1; } typedef enum SCSU_State { SingleByte, DefiningWindow, DefiningExtendedWindow, DefiningExtendedWindowLSB, QuotingWindow, QuotingUnicode, QuotingUnicodeLSB, Unicode, UnicodeLSB, UnicodeQuoting } SCSU_State; typedef struct SCSU_Encoding { EncodingPriv e; UCS4 dynamic_window[8]; SCSU_State state; UCS4 window_base; unsigned char msb; char window_no; UCS2 surrogate; /* For writing only */ unsigned int count; unsigned int window_last_used[8]; UCS4 next; char lock_to[8]; char learning_from, learning_to, is_first; } SCSU_Encoding; static int scsu_reset(Encoding *e, int for_encoding) { SCSU_Encoding *se = (SCSU_Encoding *) e; int i; for (i=0; i<8; i++) se->dynamic_window[i] = window_offset(default_dynamic_window[i]); se->window_base = se->dynamic_window[0]; se->surrogate = 0; se->state = SingleByte; se->window_no = 0; if (for_encoding != encoding_READ) { se->count = 8; se->next = NULL_UCS4; se->learning_from = 8; se->is_first = 1; for (i=0; i<8; i++) { se->window_last_used[i] = default_window_usage[i]; /* Of the default dynamic windows, I reckon all are good to lock to * when there is no clear other reason, except window 7 (Fullwidth ASCII). */ se->lock_to[i] = default_lock_to(default_dynamic_window[i]); } } return 1; } static unsigned int scsu_read(Encoding *e, encoding_read_callback_fn ucs_out, const char *s, unsigned int n, void *handle) { SCSU_Encoding *se = (SCSU_Encoding *) e; unsigned int count; for (count = n; count; count--) { unsigned char c = *s++; UCS4 u; switch (se->state) { case SingleByte: switch (c) { case SQU: se->state = QuotingUnicode; continue; case SCU: se->state = Unicode; continue; case SQ0: case SQ1: case SQ2: case SQ3: case SQ4: case SQ5: case SQ6: case SQ7: se->state = QuotingWindow; se->window_no = c - SQ0; continue; case SC0: case SC1: case SC2: case SC3: case SC4: case SC5: case SC6: case SC7: se->window_base = se->dynamic_window[c - SC0]; continue; case SD0: case SD1: case SD2: case SD3: case SD4: case SD5: case SD6: case SD7: se->state = DefiningWindow; se->window_no = c - SD0; continue; case SDX: se->state = DefiningExtendedWindow; continue; case 0x0C: /* reserved */ u = 0xFFFD; break; default: if (c < 0x80) u = c; else u = se->window_base + c - 0x80; break; } break; case Unicode: switch (c) { case UQU: se->state = UnicodeQuoting; continue; case UC0: case UC1: case UC2: case UC3: case UC4: case UC5: case UC6: case UC7: se->state = SingleByte; se->window_base = se->dynamic_window[c - UC0]; continue; case UD0: case UD1: case UD2: case UD3: case UD4: case UD5: case UD6: case UD7: se->state = DefiningWindow; se->window_no = c - UD0; continue; case UDX: se->state = DefiningExtendedWindow; continue; case 0xF2: /* reserved */ u = 0xFFFD; break; default: se->state = UnicodeLSB; se->msb = c; continue; } break; case UnicodeLSB: se->state = Unicode; u = (se->msb << 8) | c; break; case UnicodeQuoting: se->state = UnicodeLSB; se->msb = c; continue; case QuotingUnicode: se->state = QuotingUnicodeLSB; se->msb = c; continue; case QuotingUnicodeLSB: se->state = SingleByte; u = (se->msb << 8) | c; break; case DefiningWindow: se->state = SingleByte; se->window_base = se->dynamic_window[se->window_no] = window_offset(c); continue; case DefiningExtendedWindow: se->state = DefiningExtendedWindowLSB; se->msb = c; continue; case DefiningExtendedWindowLSB: se->state = SingleByte; se->window_base = se->dynamic_window[se->msb >> 5] = 0x10000 + (((se->msb & 0x1F) << 8) | c) * 0x80; continue; case QuotingWindow: se->state = SingleByte; if (c < 0x80) u = static_window[se->window_no] + c; else u = se->dynamic_window[se->window_no] + c - 0x80; break; default: /* Should never happen */ se->state = SingleByte; u = 0xFFFD; break; } if (se->surrogate) { if (u < 0xDC00 || u >= 0xE000) u = 0xFFFD; else u = 0x10000 + ((se->surrogate - 0xD800) << 10) + u - 0xDC00; se->surrogate = 0; } else if (u >= 0xD800 && u < 0xDC00) { se->surrogate = u; continue; } else if (u >= 0xDC00 && u < 0xE000) u = 0xFFFD; if (ucs_out) if (ucs_out(handle, u)) break; } return n - count; } /* * What window offset should I use for this character? * Returns 0 if not accessible via a standard window. */ static int window_offset_for_character(UCS4 u) { /* * We'll try the special offsets first - if they exist, it's * probably because they're better than the standard offsets */ unsigned int i; for (i=SPECIAL_OFFSET_BASE; i<0x100u; i++) if (u - special_offset[i - SPECIAL_OFFSET_BASE] < 0x80u) return i; if (u >= 0x0080u && u < 0x3400u) return u / 0x80u; else if (u >= 0xE000u & u < 0x10000u) return (u - 0xAC00u) / 0x80u; else return 0; } /* * Find a dynamic window containing character (-1 if none) */ static int find_dynamic_window(SCSU_Encoding *se, UCS4 u) { int i, l = 0, n = -1; /* Use the most recently used window, if there is more than one * possibility. */ for (i=0; i<8; i++) if (u - se->dynamic_window[i] < 0x80u && se->window_last_used[i] >= l) l = se->window_last_used[n = i]; return n; } /* * Find a dynamic window containing characters "u" and "next". If not * possible, one containing the "u" (-1 if none) */ static int find_dynamic_window_with_next(SCSU_Encoding *se, UCS4 u, UCS4 next) { int i, l = 0, n = -1; /* Use the most recently used window, if there is more than one * possibility. */ for (i=0; i<8; i++) if (u - se->dynamic_window[i] < 0x80u) { int priority = se->window_last_used[i]; if (next - se->dynamic_window[i] < 0x80u) priority += 0x100000; if (priority >= l) l = priority, n = i; } return n; } /* Find a static window containing "u" */ static int find_static_window(UCS4 u) { int i; for (i=0; i<8; i++) if (u - static_window[i] < 0x80u) return i; return -1; } /* Find least recently used window */ static int lru_window(SCSU_Encoding *se) { int i, l = INT_MAX, n = 0; unsigned int *w = se->window_last_used; for (i=0; i<8; i++) if (w[i] < l) l = w[n = i]; return n; } /* * Is character "u" one of the window-independent ASCII/control * characters? */ static int is_valid_ascii(UCS4 u) { return u < 0x0080 && (u >= 0x0020 || u == 0x0000 || u == 0x0009 || u == 0x000A || u == 0x000D); } /* * Is character "u" expressible as a single byte if the current * window is "window"? */ static int is_single_byte(SCSU_Encoding *se, UCS4 u, int window) { return is_valid_ascii(u) || u - se->dynamic_window[window] < 0x80u; } static int output(int c, char **out, int *outsize) { if ((*outsize)-- > 0) { *(*out)++ = c; return 1; } else return 0; } /* * This compressor is an attempt to be optimal, given one character * look-ahead. We use a LRU-replacement strategy for windows. * * Uses the same members of SCSU_Encoding as reading, but we only use * states SingleByte and Unicode. */ static int scsu_write(EncodingPriv *e, UCS4 u, char **buf, int *bufsize) { SCSU_Encoding *se = (SCSU_Encoding *) e; SCSU_State state = se->state; int window_no = se->window_no; int ok = 0; int w; UCS4 next; #define OUTPUT(c) output(c, buf, bufsize) if (u == NULL_UCS4) /* Flush */ { #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "Flush\n"); #endif } else { /* Knock out illegal characters immediately */ if (u >= 0x110000 || u >= 0xD800 && u < 0xE000) { if (se->e.for_encoding == encoding_WRITE_STRICT) return -1; else u = 0xFFFD; } } /* Put this character into "next", then process the existing "next" */ next = u; u = se->next; /* * At this point, "next" is the next character (which the caller passed in) * and se->next = u is the character we're processing. If we can't output * the character, we will leave se->next unchanged so that when the caller * calls us again with "next", we will end up in the same place. * * BE AWARE THAT "next" COULD BE NULL_UCS4, INDICATING END OF STREAM */ if (u == NULL_UCS4) { se->next = next; return 0; } #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%08X: ", u); #endif /* Special rule for byte-order mark (introduced in revision 3.1) - always * output using SQU. */ if (u == 0xFEFF && se->is_first) goto output_squ; /* Is it ASCII or a permitted control code? */ if (is_valid_ascii(u)) { if (state == Unicode) { /* Only switch to single byte if the next character is * expressible as a single byte. If that next character is * ASCII, switch to the previous window, otherwise to the * window the next character requires. */ w = window_no; if (is_valid_ascii(next) || (w = find_dynamic_window(se, next)) >= 0) { state = SingleByte; window_no = w; OUTPUT(UC0 + w); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "UC%d ", window_no); #endif } else goto output_unicode; } ok = OUTPUT(u); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%02X\n", u); #endif } /* Is it in our current window? */ else if (state == SingleByte && u - se->dynamic_window[window_no] < 0x80u) { if (se->learning_from < 8) { #ifdef DEBUG_LEARNING { int old = se->lock_to[se->learning_to]; int new = (se->learning_to == window_no); fprintf(stderr, "{locking to %d is %s%s} ", se->learning_to, new != old ? "now " : "", new ? "good" : "bad"); } #endif se->lock_to[se->learning_to] = (se->learning_to == window_no); se->learning_from = 8; } ok = OUTPUT(0x80u + u - se->dynamic_window[window_no]); if (ok) se->window_last_used[window_no] = se->count++; #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%02X\n", 0x80u + u - se->dynamic_window[window_no]); #endif } /* Is it in one of our dynamic windows? */ else if ((w = find_dynamic_window_with_next(se, u, next)) >= 0) { if (state == Unicode) { /* When's it worthwhile to switch? When the following character * is a single byte, I reckon. */ if (is_single_byte(se, next, w)) { window_no = w; state = SingleByte; OUTPUT(UC0 + w); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "UC%d ", w); #endif } else goto output_unicode; } else { int lock; if (se->learning_from < 8) { if (se->learning_from == w) { #ifdef DEBUG_LEARNING fprintf(stderr, "{locking to %d is %sbad} ", se->learning_to, se->lock_to[se->learning_to] != 0 ? "now ": ""); #endif se->lock_to[se->learning_to] = 0; /* Shouldn't have switched */ } else if (se->learning_to == w) { #ifdef DEBUG_LEARNING fprintf(stderr, "{locking to %d is %sgood} ", se->learning_to, se->lock_to[se->learning_to] != 1 ? "now ": ""); #endif se->lock_to[se->learning_to] = 1; /* Should have switched */ } se->learning_from = 8; } /* * We'll do a locking shift if the next character is in the * new window. We'll do a single shift if the next character * is in the current window, but not the new one. * In the remaining cases (ie next character not in either new * or current window, including ASCII), we have a learning * function. */ if (next - se->dynamic_window[w] < 0x80) lock = 1; else if (next - se->dynamic_window[window_no] < 0x80) lock = 0; else { lock = se->lock_to[w]; se->learning_from = window_no; se->learning_to = w; #ifdef DEBUG_LEARNING fprintf(stderr, "{%s to %d} ", lock ? "locking" : "not locking", w); #endif } if (lock) { window_no = w; OUTPUT(SC0 + w); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SC%d ", w); #endif } else { /* We're doing a single shift. I reckon that if we're only * using a dynamic window for single shifts, and there's a * static window that could do the same job, we might as well * use that, leaving the dynamic window free to be replaced. */ int temp; if ((temp = find_static_window(u)) >= 0) { w = temp; goto output_static_quote; } OUTPUT(SQ0 + w); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SQ%d ", w); #endif } } #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%02X\n", 0x80u + u - se->dynamic_window[w]); #endif ok = OUTPUT(0x80u + u - se->dynamic_window[w]); if (ok) se->window_last_used[w] = se->count++; } /* If in single-byte mode, is it in one of the static windows? */ else if (state == SingleByte && (w = find_static_window(u)) >= 0) { int temp; if (!is_single_byte(se, next, window_no) && (temp = window_offset_for_character(u)) != 0 && window_offset_for_character(next) == temp) { /* The next character becomes 1 byte if we instead define * a dynamic window for this character. Probably a good * cue to do so. */ w = temp; goto define_window; } output_static_quote: #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SQ%d %02X\n", w, u - static_window[w]); #endif OUTPUT(SQ0 + w); ok = OUTPUT(u - static_window[w]); } /* Can we define a window to cover this character? (Don't bother if it's in a static window, which may be the case if in Unicode mode) */ else if ((w = window_offset_for_character(u)) != 0 && find_static_window(u) == -1) { UCS4 base; int temp; if (state == Unicode) { /* Don't bother if the next character can't be expressed * as a single byte after the define (ie the UDn + char + next * = 4 bytes, the same as staying in Unicode). */ if (!is_valid_ascii(next) /* Not valid ASCII / ctrl */ && next - window_offset(w) >= 0x80u) /* Not in new window */ { goto output_unicode; } } define_window: se->learning_from = 8; /* Make sure we don't splat the window we're going to need * for the next character */ if ((temp = find_dynamic_window(se, next)) >= 0) se->window_last_used[temp] = se->count+1; window_no = lru_window(se); if (state == Unicode) { state = SingleByte; OUTPUT(UD0 + window_no); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "UD%d ", window_no); #endif } else { OUTPUT(SD0 + window_no); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SD%d ", window_no); #endif } OUTPUT(w); base = window_offset(w); #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%02X %02X [Window %d = %04X]\n", w, 0x80u + u - base, window_no, base); #endif ok = OUTPUT(0x80u + u - base); if (ok) { /* * If next character is neither in this window, nor the * previous one, set up the learning variables. */ if (se->state == SingleByte && !(next - base < 0x80) && !(next - se->dynamic_window[se->window_no] < 0x80)) { se->learning_from = se->window_no; se->learning_to = window_no; } se->lock_to[window_no] = default_lock_to(w); se->dynamic_window[window_no] = base; se->window_last_used[window_no] = se->count++; } } /* Is it in the BMP? */ else if (u < 0x10000) { output_unicode: if (u >= 0xE000 && u < 0xF300) { if (state == Unicode) { #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "UQU "); #endif OUTPUT(UQU); } else { /* Logically, can't happen, as we would have defined a window to cover it. But you never know... */ output_squ: #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SQU "); #endif OUTPUT(SQU); } } else { if (state == SingleByte) { if (is_single_byte(se, next, window_no)) { if (se->learning_from < 8 && next >= 0x80u) /* Single byte because our guessed window is correct... */ { #ifdef DEBUG_LEARNING { int old = se->lock_to[se->learning_to]; int new = (se->learning_to == window_no); fprintf(stderr, "{locking to %d is %s%s} ", se->learning_to, new != old ? "now " : "", new ? "good" : "bad"); } #endif se->lock_to[se->learning_to] = (se->learning_to == window_no); } #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SQU "); #endif OUTPUT(SQU); } else { if (se->learning_from < 8) { if (next - se->dynamic_window[se->learning_from] < 0x80) { #ifdef DEBUG_LEARNING fprintf(stderr, "{locking to %d is %sbad} ", se->learning_to, se->lock_to[se->learning_to] != 0 ? "now " : ""); #endif se->lock_to[se->learning_to] = 0; /* Shouldn't have locked */ } else if (next - se->dynamic_window[se->learning_to] < 0x80) { #ifdef DEBUG_LEARNING fprintf(stderr, "{locking to %d is %sgood} ", se->learning_to, se->lock_to[se->learning_to] != 1 ? "now " : ""); #endif se->lock_to[se->learning_to] = 1; /* Should have locked */ } } state = Unicode; #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SCU "); #endif OUTPUT(SCU); } } } se->learning_from = 8; #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%02X %02X\n", u >> 8, u & 0xFF); #endif OUTPUT(u >> 8); ok = OUTPUT(u & 0xFF); } /* Is it in the SIP? */ else if (u >= 0x20000 && u < 0x30000) { /* It's in the Supplementary Plane for CJK Unified Ideographs. We'll * assume it's a bad thing to create a window, as these will tend to * be sparse. However, if we're in single-byte mode, getting in and * out of Unicode mode is so expensive that we might as well define * a window - even if it ends up never being reused, redefining it * again ends up at less cost as not defining it in the first * place. ("SDX nn nn cc SDn cc" vs "SCU hh hh ll ll UCn cc") * * So, we'll define a window unless either we are in Unicode mode, * or we have to switch into Unicode for the next character anyway. * * If we do come across one of these isolated in single-byte mode, * it's probably as part of some sort of code chart, in which case * window defining might be the right thing to do anyway. * * Exception: if the next character is in the same window as this * one, we will define the window, even if in Unicode mode. */ UCS4 h, l; /* Is the next character in the same window? Or alternatively, are * we in single-byte mode, and we don't have to switch to Unicode * for the next character? */ if ((u &~ 0x7F) == (next &~ 0x7F) || (state == SingleByte && (next < 0x80 || window_offset_for_character(next)))) goto define_extended_window; if (state == SingleByte) { /* Always do it as "SCU hh hh ll ll" rather than "SQU hh hh SQU ll ll" - * we can stay in Unicode, or get a choice of any window for the same * price as the pair of SQUs. */ state = Unicode; #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SCU "); #endif OUTPUT(SCU); } h = 0xD800 + ((u - 0x10000) >> 10); l = 0xDC00 + (u & 0x3FF); se->learning_from = 8; #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%02X %02X %02X %02X\n", h >> 8, h & 0xFF, l >> 8, l & 0xFF); #endif OUTPUT(h >> 8); OUTPUT(h & 0xFF); OUTPUT(l >> 8); ok = OUTPUT(l & 0xFF); } else /* u >= 0x10000 && u < 0x110000 && not in any window && not in SIP */ { /* We'll always do it by defining a window. This takes * 4 bytes, regardless of current mode. With 1 look-ahead, * we're assuming that putting it into a window is a * Good Thing. */ UCS4 base; define_extended_window: se->learning_from = 8; if (state == Unicode) { state = SingleByte; #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "UDX "); #endif OUTPUT(UDX); } else { #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "SDX "); #endif OUTPUT(SDX); } window_no = lru_window(se); base = u &~ 0x7F; #ifdef DEBUG_SCSU_COMPRESSION fprintf(stderr, "%02X %02X %02X [Window %d = %08X]\n", (window_no << 5) | ((base - 0x10000) >> 15), ((base - 0x10000) >> 7) & 0xFF, 0x80u + u - base, window_no, base); #endif OUTPUT((window_no << 5) | ((base - 0x10000) >> 15)); OUTPUT(((base - 0x10000) >> 7) & 0xFF); ok = OUTPUT(0x80u + u - base); if (ok) { if (se->state == SingleByte && !(next - base < 0x80) && !(next - se->dynamic_window[se->window_no])) { se->learning_from = se->window_no; se->learning_to = window_no; } se->lock_to[window_no] = 0; /* Assume we don't lock to extended windows */ se->dynamic_window[window_no] = base; se->window_last_used[window_no] = se->count++; } } if (ok) { se->window_no = window_no; se->state = state; se->next = next; se->is_first = 0; } return ok; } EncodingPriv enc_scsu = { scsu_read, scsu_reset, sizeof(SCSU_Encoding) - sizeof(EncodingPriv), 0, /* scsu_delete */ 0, scsu_write };