?? arabicshaping.java
字號:
/********************************************************************************* Copyright (C) 2001, International Business Machines* Corporation and others. All Rights Reserved.********************************************************************************/package com.lowagie.text.pdf;//import com.ibm.icu.lang.*;/** * Shape Arabic text on a character basis. * * <p>ArabicShaping performs basic operations for "shaping" Arabic text. It is most * useful for use with legacy data formats and legacy display technology * (simple terminals). All operations are performed on Unicode characters.</p> * * <p>Text-based shaping means that some character code points in the text are * replaced by others depending on the context. It transforms one kind of text * into another. In comparison, modern displays for Arabic text select * appropriate, context-dependent font glyphs for each text element, which means * that they transform text into a glyph vector.</p> * * <p>Text transformations are necessary when modern display technology is not * available or when text needs to be transformed to or from legacy formats that * use "shaped" characters. Since the Arabic script is cursive, connecting * adjacent letters to each other, computers select images for each letter based * on the surrounding letters. This usually results in four images per Arabic * letter: initial, middle, final, and isolated forms. In Unicode, on the other * hand, letters are normally stored abstract, and a display system is expected * to select the necessary glyphs. (This makes searching and other text * processing easier because the same letter has only one code.) It is possible * to mimic this with text transformations because there are characters in * Unicode that are rendered as letters with a specific shape * (or cursive connectivity). They were included for interoperability with * legacy systems and codepages, and for unsophisticated display systems.</p> * * <p>A second kind of text transformations is supported for Arabic digits: * For compatibility with legacy codepages that only include European digits, * it is possible to replace one set of digits by another, changing the * character code points. These operations can be performed for either * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic * digits (U+06f0...U+06f9).</p> * * <p>Some replacements may result in more or fewer characters (code points). * By default, this means that the destination buffer may receive text with a * length different from the source length. Some legacy systems rely on the * length of the text to be constant. They expect extra spaces to be added * or consumed either next to the affected character or at the end of the * text.</p> */public final class ArabicShaping { private final int options; private boolean isLogical; // convenience /** * Convert a range of text in the source array, putting the result * into a range of text in the destination array, and return the number * of characters written. * * @param source An array containing the input text * @param sourceStart The start of the range of text to convert * @param sourceLength The length of the range of text to convert * @param dest The destination array that will receive the result. * It may be <code>NULL</code> only if <code>destSize</code> is 0. * @param destStart The start of the range of the destination buffer to use. * @param destSize The size (capacity) of the destination buffer. * If <code>destSize</code> is 0, then no output is produced, * but the necessary buffer size is returned ("preflighting"). This * does not validate the text against the options, for example, * if letters are being unshaped, and spaces are being consumed * following lamalef, this will not detect a lamalef without a * corresponding space. An error will be thrown when the actual * conversion is attempted. * @return The number of chars written to the destination buffer. * If an error occurs, then no output was written, or it may be * incomplete. * @if the text cannot be converted according to the options. */ public int shape(char[] source, int sourceStart, int sourceLength, char[] dest, int destStart, int destSize) { if (source == null) { throw new IllegalArgumentException("source can not be null"); } if (sourceStart < 0 || sourceLength < 0 || sourceStart + sourceLength > source.length) { throw new IllegalArgumentException("bad source start (" + sourceStart + ") or length (" + sourceLength + ") for buffer of length " + source.length); } if (dest == null && destSize != 0) { throw new IllegalArgumentException("null dest requires destSize == 0"); } if ((destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length)) { throw new IllegalArgumentException("bad dest start (" + destStart + ") or size (" + destSize + ") for buffer of length " + dest.length); } return internalShape(source, sourceStart, sourceLength, dest, destStart, destSize); } /** * Convert a range of text in place. This may only be used if the Length option * does not grow or shrink the text. * * @param source An array containing the input text * @param start The start of the range of text to convert * @param length The length of the range of text to convert * @if the text cannot be converted according to the options. */ public void shape(char[] source, int start, int length) { if ((options & LENGTH_MASK) == LENGTH_GROW_SHRINK) { throw new RuntimeException("Cannot shape in place with length option grow/shrink."); } shape(source, start, length, source, start, length); } /** * Convert a string, returning the new string. * * @param source The string to convert. * @return The converted string. * @if the string cannot be converted according to the options. */ public String shape(String text) { char[] src = text.toCharArray(); char[] dest = src; if (((options & LENGTH_MASK) == LENGTH_GROW_SHRINK) && ((options & LETTERS_MASK) == LETTERS_UNSHAPE)) { dest = new char[src.length * 2]; // max } int len = shape(src, 0, src.length, dest, 0, dest.length); return new String(dest, 0, len); } /** * Construct ArabicShaping using the options flags. * The flags are as follows:<br> * 'LENGTH' flags control whether the text can change size, and if not, * how to maintain the size of the text when LamAlef ligatures are * formed or broken.<br> * 'TEXT_DIRECTION' flags control whether the text is read and written * in visual order or in logical order.<br> * 'LETTERS_SHAPE' flags control whether conversion is to or from * presentation forms.<br> * 'DIGITS' flags control whether digits are shaped, and whether from * European to Arabic-Indic or vice-versa.<br> * 'DIGIT_TYPE' flags control whether standard or extended Arabic-Indic * digits are used when performing digit conversion. */ public ArabicShaping(int options) { this.options = options; if ((options & DIGITS_MASK) > 0x80) { throw new IllegalArgumentException("bad DIGITS options"); } isLogical = (options & TEXT_DIRECTION_MASK) == TEXT_DIRECTION_LOGICAL; } /** * Memory option: allow the result to have a different length than the source. */ public static final int LENGTH_GROW_SHRINK = 0; /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces next to modified characters. */ public static final int LENGTH_FIXED_SPACES_NEAR = 1; /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the end of the text. */ public static final int LENGTH_FIXED_SPACES_AT_END = 2; /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the beginning of the text. */ public static final int LENGTH_FIXED_SPACES_AT_BEGINNING = 3; /** * Bit mask for memory options. */ public static final int LENGTH_MASK = 3; /** * Direction indicator: the source is in logical (keyboard) order. */ public static final int TEXT_DIRECTION_LOGICAL = 0; /** * Direction indicator: the source is in visual (display) order, that is, * the leftmost displayed character is stored first. */ public static final int TEXT_DIRECTION_VISUAL_LTR = 4; /** * Bit mask for direction indicators. */ public static final int TEXT_DIRECTION_MASK = 4; /** * Letter shaping option: do not perform letter shaping. */ public static final int LETTERS_NOOP = 0; /** * Letter shaping option: replace normative letter characters in the U+0600 (Arabic) block, * by shaped ones in the U+FE70 (Presentation Forms B) block. Performs Lam-Alef ligature * substitution. */ public static final int LETTERS_SHAPE = 8; /** * Letter shaping option: replace shaped letter characters in the U+FE70 (Presentation Forms B) block * by normative ones in the U+0600 (Arabic) block. Converts Lam-Alef ligatures to pairs of Lam and * Alef characters, consuming spaces if required. */ public static final int LETTERS_UNSHAPE = 0x10; /** * Letter shaping option: replace normative letter characters in the U+0600 (Arabic) block, * except for the TASHKEEL characters at U+064B...U+0652, by shaped ones in the U+Fe70 * (Presentation Forms B) block. The TASHKEEL characters will always be converted to * the isolated forms rather than to their correct shape. */ public static final int LETTERS_SHAPE_TASHKEEL_ISOLATED = 0x18; /** * Bit mask for letter shaping options. */ public static final int LETTERS_MASK = 0x18; /** * Digit shaping option: do not perform digit shaping. */ public static final int DIGITS_NOOP = 0; /** * Digit shaping option: Replace European digits (U+0030...U+0039) by Arabic-Indic digits. */ public static final int DIGITS_EN2AN = 0x20; /** * Digit shaping option: Replace Arabic-Indic digits by European digits (U+0030...U+0039). */ public static final int DIGITS_AN2EN = 0x40; /** * Digit shaping option: * Replace European digits (U+0030...U+0039) by Arabic-Indic digits * if the most recent strongly directional character * is an Arabic letter (its Bidi direction value is RIGHT_TO_LEFT_ARABIC). * The initial state at the start of the text is assumed to be not an Arabic, * letter, so European digits at the start of the text will not change. * Compare to DIGITS_ALEN2AN_INIT_AL. */ public static final int DIGITS_EN2AN_INIT_LR = 0x60; /** * Digit shaping option: * Replace European digits (U+0030...U+0039) by Arabic-Indic digits * if the most recent strongly directional character * is an Arabic letter (its Bidi direction value is RIGHT_TO_LEFT_ARABIC). * The initial state at the start of the text is assumed to be an Arabic, * letter, so European digits at the start of the text will change. * Compare to DIGITS_ALEN2AN_INT_LR. */ public static final int DIGITS_EN2AN_INIT_AL = 0x80; /** Not a valid option value. */ private static final int DIGITS_RESERVED = 0xa0; /** * Bit mask for digit shaping options. */ public static final int DIGITS_MASK = 0xe0; /** * Digit type option: Use Arabic-Indic digits (U+0660...U+0669). */ public static final int DIGIT_TYPE_AN = 0; /** * Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). */ public static final int DIGIT_TYPE_AN_EXTENDED = 0x100; /** * Bit mask for digit type options. */ public static final int DIGIT_TYPE_MASK = 0x0100; // 0x3f00? public boolean equals(Object rhs) { return rhs != null && rhs.getClass() == ArabicShaping.class && options == ((ArabicShaping)rhs).options; } public int hashCode() { return options; } public String toString() { StringBuffer buf = new StringBuffer(super.toString()); buf.append('['); switch (options & LENGTH_MASK) { case LENGTH_GROW_SHRINK: buf.append("grow/shrink"); break; case LENGTH_FIXED_SPACES_NEAR: buf.append("spaces near"); break; case LENGTH_FIXED_SPACES_AT_END: buf.append("spaces at end"); break; case LENGTH_FIXED_SPACES_AT_BEGINNING: buf.append("spaces at beginning"); break; } switch (options & TEXT_DIRECTION_MASK) { case TEXT_DIRECTION_LOGICAL: buf.append(", logical"); break; case TEXT_DIRECTION_VISUAL_LTR: buf.append(", visual"); break; } switch (options & LETTERS_MASK) { case LETTERS_NOOP: buf.append(", no letter shaping"); break; case LETTERS_SHAPE: buf.append(", shape letters"); break; case LETTERS_SHAPE_TASHKEEL_ISOLATED: buf.append(", shape letters tashkeel isolated"); break; case LETTERS_UNSHAPE: buf.append(", unshape letters"); break; } switch (options & DIGITS_MASK) { case DIGITS_NOOP: buf.append(", no digit shaping"); break; case DIGITS_EN2AN: buf.append(", shape digits to AN"); break; case DIGITS_AN2EN: buf.append(", shape digits to EN"); break; case DIGITS_EN2AN_INIT_LR: buf.append(", shape digits to AN contextually: default EN"); break; case DIGITS_EN2AN_INIT_AL: buf.append(", shape digits to AN contextually: default AL"); break; } switch (options & DIGIT_TYPE_MASK) { case DIGIT_TYPE_AN: buf.append(", standard Arabic-Indic digits"); break; case DIGIT_TYPE_AN_EXTENDED: buf.append(", extended Arabic-Indic digits"); break; } buf.append("]"); return buf.toString(); } // // ported api // private static final int IRRELEVANT = 4; private static final int LAMTYPE = 16; private static final int ALEFTYPE = 32; private static final int LINKR = 1; private static final int LINKL = 2; private static final int LINK_MASK = 3; private static final int irrelevantPos[] = { 0x0, 0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE }; private static final char convertLamAlef[] = { '\u0622', // FEF5 '\u0622', // FEF6 '\u0623', // FEF7 '\u0623', // FEF8 '\u0625', // FEF9 '\u0625', // FEFA '\u0627', // FEFB '\u0627' // FEFC }; private static final char convertNormalizedLamAlef[] = { '\u0622', // 065C '\u0623', // 065D '\u0625', // 065E '\u0627', // 065F }; private static final int[] araLink = { 1 + 32 + 256 * 0x11, /*0x0622*/ 1 + 32 + 256 * 0x13, /*0x0623*/ 1 + 256 * 0x15, /*0x0624*/ 1 + 32 + 256 * 0x17, /*0x0625*/ 1 + 2 + 256 * 0x19, /*0x0626*/ 1 + 32 + 256 * 0x1D, /*0x0627*/ 1 + 2 + 256 * 0x1F, /*0x0628*/ 1 + 256 * 0x23, /*0x0629*/ 1 + 2 + 256 * 0x25, /*0x062A*/ 1 + 2 + 256 * 0x29, /*0x062B*/ 1 + 2 + 256 * 0x2D, /*0x062C*/ 1 + 2 + 256 * 0x31, /*0x062D*/ 1 + 2 + 256 * 0x35, /*0x062E*/
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -