View Javadoc

1   package net.sf.bacchus.charset;
2   
3   import java.nio.ByteBuffer;
4   import java.nio.charset.Charset;
5   import java.nio.charset.CharsetEncoder;
6   
7   /**
8    * {@link java.nio.charset.Charset} that decodes only the characters that are
9    * allowed in an ACH entry and attempts to transpose other characters to 1-byte
10   * equivalents when encoding to ACH. The transpositions replace characters found
11   * within the {@link java.lang.Character.UnicodeBlock#LATIN_1_SUPPLEMENT} and
12   * {@link java.lang.Character.UnicodeBlock#LATIN_EXTENDED_A} ranges with
13   * reasonable equivalents within the 7-bit ASCII range that is allowed within an
14   * ACH record. In most cases, the replacement is simply the unaccented version
15   * of the same character.
16   */
17  public class X_ACH_XP extends X_US_ASCII_ACH {
18  
19      /** the name of this character set. */
20      public static final String NAME = "X-ACH-X";
21  
22      /** marker for unmappable characters. */
23      private static final byte NONE = 0x00;
24  
25      /**
26       * transposition equivalents for the Unicode C1 Control Characters and
27       * Latin-1 Supplement starting at 0x0080.
28       */
29      private static final byte[] LATIN_1_SUPPLEMENT = new byte[] { NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
30              NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
31              NONE, NONE, NONE, NONE, NONE, NONE, '!', NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, '"', NONE,
32              NONE, NONE, NONE, NONE, NONE, '2', '3', NONE, NONE, NONE, NONE, NONE, NONE, NONE, '"', NONE, NONE, NONE,
33              '?', 'A', 'A', 'A', 'A', 'A', 'A', NONE, 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N', 'O', 'O',
34              'O', 'O', 'O', NONE, 'O', 'U', 'U', 'U', 'U', 'Y', NONE, NONE, 'a', 'a', 'a', 'a', 'a', 'a', NONE, 'c',
35              'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', NONE, 'n', 'o', 'o', 'o', 'o', 'o', '/', 'o', 'u', 'u', 'u', 'u',
36              'y', NONE, 'y' };
37  
38      /**
39       * transposition equivalents for the Unicode Latin Extended-A characters
40       * starting at 0x0100.
41       */
42      private static final byte[] LATIN_EXTENDED_A = new byte[] { 'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C',
43              'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g',
44              'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', NONE, NONE, 'J',
45              'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n',
46              'N', 'n', 'O', 'o', 'O', 'o', 'O', 'o', NONE, NONE, 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S',
47              's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
48              'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'z', 'z', 's' };
49  
50      /**
51       * Encoder that attempts to map out-of-range characters to equivalent
52       * characters that can be represented using single-byte representations
53       * within the characters valid in an ACH record. Accented characters are
54       * mapped to their non-accented form.
55       */
56      protected static class Transposer extends Encoder {
57  
58          /**
59           * Constructs an encoder for {@code X-ACH-X}.
60           * @param cs the character set this decodes.
61           */
62          public Transposer(final X_ACH_XP cs) {
63              this(cs, 1);
64          }
65  
66          /**
67           * Constructs an encoder for {@code X-ACH-X}.
68           * @param cs the character set this decodes.
69           * @param maxBytesPerChar the maximum number of bytes for a single
70           *        character.
71           */
72          protected Transposer(final Charset cs, final float maxBytesPerChar) {
73              super(cs, maxBytesPerChar);
74          }
75  
76          /**
77           * Maps characters to their single-byte equivalents.
78           * @param c {@inheritDoc}
79           * @param out {@inheritDoc}
80           * @return {@inheritDoc}
81           */
82          @Override
83          protected boolean map(final char c, final ByteBuffer out) {
84              final byte equivalent;
85              if (c > 127 && c < 256) {
86                  equivalent = LATIN_1_SUPPLEMENT[c - 128];
87              } else if (c >= 256 && c < 384) {
88                  equivalent = LATIN_EXTENDED_A[c - 256];
89              } else {
90                  equivalent = NONE;
91              }
92  
93              if (equivalent == NONE) {
94                  return false;
95              } else {
96                  out.put(equivalent);
97                  return true;
98              }
99          }
100     }
101 
102     /**
103      * Initializes a new {@code X-ACH-X} {@link Charset}.
104      */
105     public X_ACH_XP() {
106         this(NAME);
107     }
108 
109     /**
110      * Initializes a new ACH {@link Charset} with the given name and no aliases.
111      * @param name the name of the character set.
112      */
113     protected X_ACH_XP(final String name) {
114         super(name);
115     }
116 
117     /**
118      * This is a very limited {@link Charset} that is assumed only to contain
119      * itself and those character sets contained by {@link X_US_ASCII_ACH}.
120      * @param cs {@inheritDoc}
121      * @return {@inheritDoc}
122      * @see X_US_ASCII_ACH#contains(Charset)
123      */
124     @Override
125     public boolean contains(final Charset cs) {
126         return NAME.equals(cs.name()) || super.contains(cs);
127     }
128 
129     /**
130      * Delegates to {@link X_US_ASCII_ACH#newEncoder()}, which is a temporary
131      * implementation.
132      * @return {@inheritDoc}
133      * @see X_US_ASCII_ACH#newEncoder()
134      */
135     @Override
136     public CharsetEncoder newEncoder() {
137         return new Transposer(this);
138     }
139 
140 }