aixjc
作者aixjc·2011-01-27 11:40
·

IBM AS400主机档的转档问题

字数 35304阅读 10367评论 0赞 0

今天在项目中需要对AS400(IBM商用小型主机)主机档案进行转档操作,涉及到的主要是UTF-8(本地文件编码格式)与代码页(CodePage 937)之间的转换。

 

1. 读取本地文件srcFile的内容,每行补齐2048位(每行不会超过2048位,不足的以空格补齐)

2. 如果行内包含中文,每个中文字符算作两位;并且考虑到400的机器,每一段连续的中文前后各有一个控制位0x0E、0x0F(shift out,shift in),即每段连续中文再加两位,例:“中文”算作 2×2+2=6 位

 

以下是JAVA代码:

Java代码
  1. /** 
  2.  * $Revision: 1.0 $ 
  3.  * $Date: Oct 14, 2010 $ 
  4.  * 
  5.  * Author: Ian Chan 
  6.  * Date  : Oct 14, 2010 
  7.  */  
  8. package com.test.util;  
  9.   
  10. import java.io.BufferedReader;  
  11. import java.io.File;  
  12. import java.io.FileInputStream;  
  13. import java.io.IOException;  
  14. import java.io.InputStream;  
  15. import java.io.InputStreamReader;  
  16. import java.io.UnsupportedEncodingException;  
  17. import java.util.regex.Matcher;  
  18. import java.util.regex.Pattern;  
  19.   
  20. import org.apache.commons.io.FileUtils;  
  21. import org.apache.commons.logging.Log;  
  22. import org.apache.commons.logging.LogFactory;  
  23.   
  24. import com.test.convert.Converter;  
  25.   
  26. /** 
  27.  * @author Ian Chan 
  28.  * @version 1.0 
  29.  */  
  30. public class ConverterHelper {  
  31.   
  32.     private static final Log logger = LogFactory.getLog(ConverterHelper.class);  
  33.   
  34.     private static final int LINE_LEN = 2048;  
  35.   
  36.     private static final String SPACE = " ";  
  37.   
  38.     public static String buildContent(File file) {  
  39.         StringBuffer buffer = new StringBuffer();  
  40.         InputStream is = null;  
  41.         BufferedReader reader;  
  42.         try {  
  43.             is = new FileInputStream(file);  
  44.             reader = new BufferedReader(new InputStreamReader(is));  
  45.             String line;  
  46.             while ((line = reader.readLine()) != null) {  
  47.                 if (line != null && line.length() < LINE_LEN) {  
  48.                     StrLength sl = new ConverterHelper().new StrLength(line, line.length());//InnerClass調用  
  49.                     handleLine(sl);  
  50.                     line = sl.line;  
  51.                     int length = sl.length;  
  52.                     int remain = LINE_LEN - length;  
  53.                     for (int i = 0; i < remain; i++) {  
  54.                         line += SPACE;//補齊空格  
  55.                     }  
  56.                     buffer.append(line);  
  57.                     System.out.println("len:" + line.length() + ",line:" + line);  
  58.                 }  
  59.             }  
  60.         } catch (IOException e) {  
  61.             logger.error(e.getMessage(), e);  
  62.         } finally {  
  63.             if (is != null)  
  64.                 try {  
  65.                     is.close();  
  66.                 } catch (IOException e) {  
  67.                     logger.error(e.getMessage(), e);  
  68.                 }  
  69.         }  
  70.         return buffer.toString();  
  71.     }  
  72.   
  73.     private static void handleLine(StrLength strLength) throws UnsupportedEncodingException {  
  74.         String regex = "([\u4e00-\u9fa5]|[ ])+";//判斷中文的正則表達式(包括中文全角空格)  
  75.         Pattern pattern = Pattern.compile(regex);  
  76.         Matcher matcher = pattern.matcher(strLength.line);  
  77.         if (!matcher.find())  
  78.             return;  
  79.         StringBuffer buffer = new StringBuffer();  
  80.         int lineLength = 0;  
  81.         boolean exist = true;  
  82.         int index = 0;  
  83.         while (exist) {  
  84.             int start = matcher.start();//匹配第一個  
  85.             int end = matcher.end();//匹配最後一個  
  86.             String beforeStart = strLength.line.substring(index, start);  
  87.             buffer.append(beforeStart);  
  88.             lineLength += beforeStart.length();  
  89.   
  90.             String cnchars = strLength.line.substring(start, end);  
  91.             buffer.append(cnchars);  
  92.             lineLength += cnchars.length() * 2 + 2;// 中文字算兩位,前後控制符(0E,0F)各算一位  
  93.   
  94.             index = end;  
  95.             exist = matcher.find(end + 1);//查找下一個中文  
  96.         }  
  97.         String remainLine = strLength.line.substring(index);  
  98.         buffer.append(remainLine);  
  99.         lineLength += remainLine.length();  
  100.   
  101.         strLength.line = buffer.toString();  
  102.         strLength.length = lineLength;  
  103.     }  
  104.   
  105.     public static void convert(String srcFileName, String destFileName) {  
  106.         try {  
  107.             String as400content = Converter.as400Encode(buildContent(new File(srcFileName)));  
  108.             FileUtils.writeStringToFile(new File(destFileName), as400content, Converter.ENCODING);  
  109.   
  110.             System.out.println(as400content);  
  111.         } catch (IOException e) {  
  112.             logger.error(e.getMessage(), e);  
  113.         }  
  114.     }  
  115.   
  116.     class StrLength {  
  117.   
  118.         private String line;  
  119.   
  120.         private int length;  
  121.   
  122.         private StrLength(String line, int length) {  
  123.             this.line = line;  
  124.             this.length = length;  
  125.         }  
  126.   
  127.         public String getLine() {  
  128.             return line;  
  129.         }  
  130.   
  131.         public void setLine(String line) {  
  132.             this.line = line;  
  133.         }  
  134.   
  135.         public int getLength() {  
  136.             return length;  
  137.         }  
  138.   
  139.         public void setLength(int length) {  
  140.             this.length = length;  
  141.         }  
  142.   
  143.     }  
  144.   
  145.     public static void main(String[] args) throws Exception {  
  146.         String srcFileName = "F:\AS400_CN";  
  147.         String destFileName = "F:\AS400_CN_CP937";  
  148.         convert(srcFileName, destFileName);  
  149.         System.out.println("Convert Finished");  
  150.   
  151.     }  
  152. }  
/** * $Revision: 1.0 $ * $Date: Oct 14, 2010 $ * * Author: Ian Chan * Date : Oct 14, 2010 */ package com.test.util; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.FileUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.test.convert.Converter; /** * @author Ian Chan * @version 1.0 */ public class ConverterHelper { private static final Log logger = LogFactory.getLog(ConverterHelper.class); private static final int LINE_LEN = 2048; private static final String SPACE = " "; public static String buildContent(File file) { StringBuffer buffer = new StringBuffer(); InputStream is = null; BufferedReader reader; try { is = new FileInputStream(file); reader = new BufferedReader(new InputStreamReader(is)); String line; while ((line = reader.readLine()) != null) { if (line != null && line.length() < LINE_LEN) { StrLength sl = new ConverterHelper().new StrLength(line, line.length());//InnerClass調用 handleLine(sl); line = sl.line; int length = sl.length; int remain = LINE_LEN - length; for (int i = 0; i < remain; i++) { line += SPACE;//補齊空格 } buffer.append(line); System.out.println("len:" + line.length() + ",line:" + line); } } } catch (IOException e) { logger.error(e.getMessage(), e); } finally { if (is != null) try { is.close(); } catch (IOException e) { logger.error(e.getMessage(), e); } } return buffer.toString(); } private static void handleLine(StrLength strLength) throws UnsupportedEncodingException { String regex = "([\u4e00-\u9fa5]|[ ])+";//判斷中文的正則表達式(包括中文全角空格) Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(strLength.line); if (!matcher.find()) return; StringBuffer buffer = new StringBuffer(); int lineLength = 0; boolean exist = true; 
int index = 0; while (exist) { int start = matcher.start();//匹配第一個 int end = matcher.end();//匹配最後一個 String beforeStart = strLength.line.substring(index, start); buffer.append(beforeStart); lineLength += beforeStart.length(); String cnchars = strLength.line.substring(start, end); buffer.append(cnchars); lineLength += cnchars.length() * 2 + 2;// 中文字算兩位,前後控制符(0E,0F)各算一位 index = end; exist = matcher.find(end + 1);//查找下一個中文 } String remainLine = strLength.line.substring(index); buffer.append(remainLine); lineLength += remainLine.length(); strLength.line = buffer.toString(); strLength.length = lineLength; } public static void convert(String srcFileName, String destFileName) { try { String as400content = Converter.as400Encode(buildContent(new File(srcFileName))); FileUtils.writeStringToFile(new File(destFileName), as400content, Converter.ENCODING); System.out.println(as400content); } catch (IOException e) { logger.error(e.getMessage(), e); } } class StrLength { private String line; private int length; private StrLength(String line, int length) { this.line = line; this.length = length; } public String getLine() { return line; } public void setLine(String line) { this.line = line; } public int getLength() { return length; } public void setLength(int length) { this.length = length; } } public static void main(String[] args) throws Exception { String srcFileName = "F:\AS400_CN"; String destFileName = "F:\AS400_CN_CP937"; convert(srcFileName, destFileName); System.out.println("Convert Finished"); } }

 对于代码页(CodePage)的内容,参考如下:

Html代码
  1. Conversion between any of the following codepages is provided.  
  2.   
  3. 37 (=x0025) EBCDIC US English  
  4. 273 (=x0111) EBCDIC German  
  5. 277 (=x0115) EBCDIC Danish/Norwegian  
  6. 278 (=x0116) EBCDIC Finnish/Swedish  
  7. 280 (=x0118) EBCDIC Italian  
  8. 284 (=x011C) EBCDIC Spanish  
  9. 285 (=x011D) EBCDIC UK English  
  10. 297 (=x0129) EBCDIC French  
  11. 300 (=x012C) EBCDIC Japanese DBCS  
  12. 301 (=x012D) Japanese PC DBCS  
  13. 420 (=x01A4) EBCDIC Arabic  
  14. 424 (=x01A8) EBCDIC Hebrew  
  15. 437 (=x01B5) PC-ASCII US  
  16. 500 (=x01F4) EBCDIC International  
  17. 803 (=x0323) Hebrew Set A  
  18. 813 (=x032D) ISO8859-7 Greek  
  19. 819 (=x0333) ISO8859-1 Western European  
  20. 833 (=x0341) IBM-833: Korean  
  21. 834 (=x0342) IBM-834: Korean Host DBCS  
  22. 835 (=x0343) EBCDIC Traditional Chinese DBCS  
  23. 836 (=x0344) EBCDIC Simplified Chinese SBCS  
  24. 838 (=x0346) EBCDIC Thai SBCS  
  25. 850 (=x0352) ISO8859-1 Western European  
  26. 852 (=x0354) PC-ASCII Eastern European  
  27. 855 (=x0357) PC-ASCII Cyrillic  
  28. 856 (=x0358) PC-ASCII Hebrew  
  29. 857 (=x0359) PC-ASCII Turkish  
  30. 858 (=x035A) PC-ASCII Western European with Euro  
  31. 860 (=x035C) PC-ASCII Portuguese  
  32. 861 (=x035D) PC-ASCII Icelandic  
  33. 862 (=x035E) PC-ASCII Hebrew  
  34. 863 (=x035F) PC-ASCII Canadian French  
  35. 864 (=x0360) PC-ASCII Arabic  
  36. 865 (=x0361) PC-ASCII Scandinavian  
  37. 866 (=x0362) PC-ASCII Cyrillic #2  
  38. 868 (=x0364) PC-ASCII Urdu  
  39. 869 (=x0365) PC-ASCII Greek  
  40. 870 (=x0366) EBCDIC Eastern Europe  
  41. 871 (=x0367) EBCDIC Icelandic  
  42. 872 (=x0368) PC-ASCII Cyrillic with Euro  
  43. 874 (=x036A) PC-ASCII Thai SBCS  
  44. 875 (=x036B) EBCDIC Greek  
  45. 880 (=x0370) EBCDIC Cyrillic  
  46. 891 (=x037B) IBM-891: Korean  
  47. 897 (=x0381) PC-ASCII Japan Data SBCS  
  48. 903 (=x0387) PC Simplified Chinese SBCS  
  49. 904 (=x0388) PC Traditional Chinese Data - SBCS  
  50. 912 (=x0390) ISO8859-2 Eastern European  
  51. 915 (=x0393) ISO8859-5 Cyrillic  
  52. 916 (=x0394) ISO8859-8 Hebrew  
  53. 918 (=x0396) EBCDIC Urdu  
  54. 920 (=x0398) ISO8859-9 Turkish  
  55. 921 (=x0399) ISO Baltic  
  56. 922 (=x039A) ISO Estonian  
  57. 923 (=x039B) ISO8859-15 Western Europe with euro (Latin 9)  
  58. 924 (=x039C) EBCDIC Western Europe with euro  
  59. 927 (=x039F) PC Traditional Chinese DBCS  
  60. 928 (=x03A0) PC Simplified Chinese DBCS  
  61. 930 (=x03A2) EBCDIC Japanese Katakana/Kanji mixed  
  62. 932 (=x03A4) Japanese OS/2  
  63. 933 (=x03A5) EBCDIC Korean Mixed  
  64. 935 (=x03A7) EBCDIC Simplified Chinese Mixed  
  65. 937 (=x03A9) EBCDIC Traditional Chinese Mixed  
  66. 939 (=x03AB) EBCDIC Japanese Latin/Kanji mixed  
  67. 941 (=x03AD) Japanese PC DBCS - for open systems  
  68. 942 (=x03AE) Japanese PC Data Mixed - extended SBCS  
  69. 943 (=x03AF) Japanese PC Mixed - for open systems  
  70. 944 (=x03B0) Korean PC data Mixed - extended SBCS  
  71. 946 (=x03B2) Simplified Chinese PC data Mixed - extended SBCS  
  72. 947 (=x03B3) PC Traditional Chinese DBCS  
  73. 948 (=x03B4) PC Traditional Chinese Mixed - extended SBCS  
  74. 949 (=x03B5) PC Korean Mixed - KS code  
  75. 950 (=x03B6) PC Traditional Chinese Mixed - big5  
  76. 951 (=x03B7) PC Korean DBCS - KS code  
  77. 970 (=x03CA) euc Korean  
  78. 1004 (=x03EC) PC Data Latin1  
  79. 1006 (=x03EE) ISO Urdu  
  80. 1008 (=x03F0) ASCII Arabic 8-bit ISO  
  81. 1025 (=x0401) EBCDIC Cyrillic  
  82. 1026 (=x0402) EBCDIC Turkish  
  83. 1027 (=x0403) EBCDIC Japanese Latin  
  84. 1040 (=x0410) IBM-1040: Korean  
  85. 1041 (=x0411) Japanese PC - extended SBCS  
  86. 1042 (=x0412) PC Simplified Chinese - extended SBCS  
  87. 1043 (=x0413) PC Traditional Chinese - extended SBCS  
  88. 1046 (=x0416) PC-ASCII Arabic  
  89. 1047 (=x0417) IBM-1047: Western European  
  90. 1051 (=x041B) ASCII roman8 for HP Western European  
  91. 1088 (=x0440) PC Korean SBCS - KS code  
  92. 1089 (=x0441) ISO8859-6 Arabic  
  93. 1097 (=x0449) EBCDIC Farsi  
  94. 1098 (=x044A) PC-ASCII Farsi  
  95. 1112 (=x0458) EBCDIC Baltic (Latvian/Lithuanian)  
  96. 1114 (=x045A) PC Traditional Chinese - big 5 SBCS  
  97. 1115 (=x045B) PC Simplified Chinese SBCS  
  98. 1122 (=x0462) EBCDIC Estonian  
  99. 1123 (=x0463) EBCDIC Ukrainian  
  100. 1124 (=x0464) UNIX-ASCII Ukrainian  
  101. 1131 (=x046B) PC-ASCII Belarus  
  102. 1140 (=x0474) EBCDIC USA, with euro (like 037)  
  103. 1141 (=x0475) EBCDIC Austria, Germany, with euro (like 273)  
  104. 1142 (=x0476) EBCDIC Denmark, Norway, with euro (like 277)  
  105. 1143 (=x0477) EBCDIC Finland, Sweden, with euro (like 278)  
  106. 1144 (=x0478) EBCDIC Italy, with euro (like 280)  
  107. 1145 (=x0479) EBCDIC Spain, with euro (like 284)  
  108. 1146 (=x047A) EBCDIC UK, with euro (like 285)  
  109. 1147 (=x047B) EBCDIC France, with euro (like 297)  
  110. 1148 (=x047C) EBCDIC International, with euro (like 500)  
  111. 1149 (=x047D) EBCDIC Iceland, with euro (like 871)  
  112. 1200 (=x04B0) Unicode - UCS-2  
  113. 1208 (=x04B8) Unicode - UTF-8  
  114. 1250 (=x04E2) Windows - Eastern European  
  115. 1251 (=x04E3) Windows - Cyrillic  
  116. 1252 (=x04E4) Windows - Western European  
  117. 1253 (=x04E5) Windows - Greek  
  118. 1254 (=x04E6) Windows - Turkish  
  119. 1255 (=x04E7) Windows - Hebrew  
  120. 1256 (=x04E8) Windows - Arabic  
  121. 1257 (=x04E9) Windows - Baltic Rim  
  122. 1275 (=x04FB) Apple - Western European  
  123. 1280 (=x0500) Apple - Greek  
  124. 1281 (=x0501) Apple - Turkish  
  125. 1282 (=x0502) Apple - Eastern European  
  126. 1283 (=x0503) Apple - Cyrillic  
  127. 1284 (=x0504) IBM-504: Eastern European  
  128. 1285 (=x0505) IBM-505: Eastern European  
  129. 1363 (=x0553) Windows Korean PC Mixed including 11,172 full hangul  
  130. 1364 (=x0554) Korean Host Mixed extended including 11,172 full hangul  
  131. 1380 (=x0564) PC Simplified Chinese DBCS  
  132. 1381 (=x0565) PC Simplified Chinese Mixed  
  133. 1383 (=x0567) euc Simplified Chinese Mixed  
  134. 1386 (=x056A) PC Simplified Chinese Data GBK Mixed  
  135. 1388 (=x056C) DBCS Host Simplified Chinese Data GBK Mixed  
  136. 5346 (=x14E2) Windows-Eastern European with Euro (like 1250)  
  137. 5347 (=x14E3) Windows - Cyrillic with Euro (like 1251)  
  138. 5348 (=x14E4) Windows-Western European with Euro (like 1252)  
  139. 5349 (=x14E5) Windows-Windows - Greek with Euro (like 1253)  
  140. 5350 (=x14E6) Windows - Turkish with Euro (like 1254)  
  141. 5351 (=x14E7) Windows - Hebrew with Euro (like 1255)  
  142. 5352 (=x14E8) Windows - Arabic with Euro (like 1256)  
  143. 5353 (=x14E9) Windows - Baltic Rim with Euro (like 1257)  
  144. 5354 (=x14EA) Windows - Vietnamese with Euro (like 1258)   
Conversion between any of the following codepages is provided. 37 (=x0025) EBCDIC US English 273 (=x0111) EBCDIC German 277 (=x0115) EBCDIC Danish/Norwegian 278 (=x0116) EBCDIC Finnish/Swedish 280 (=x0118) EBCDIC Italian 284 (=x011C) EBCDIC Spanish 285 (=x011D) EBCDIC UK English 297 (=x0129) EBCDIC French 300 (=x012C) EBCDIC Japanese DBCS 301 (=x012D) Japanese PC DBCS 420 (=x01A4) EBCDIC Arabic 424 (=x01A8) EBCDIC Arabic 437 (=x01B5) PC-ASCII US 500 (=x01F4) EBCDIC International 803 (=x0323) Hebrew Set A 813 (=x032D) ISO8859-7 Greek 819 (=x0333) ISO8859-1 Western European 833 (=x0341) IBM-833: Korean 834 (=x0342) IBM-834: Korean Host DBCS 835 (=x0343) EBCDIC Traditional Chinese DBCS 836 (=x0344) EBCDIC Simplified Chinese SBCS 838 (=x0346) EBCDIC Thai SBCS 850 (=x0352) ISO8859-1 Western European 852 (=x0354) PC-ASCII Eastern European 855 (=x0357) PC-ASCII Cyrillic 856 (=x0358) PC-ASCII Hebrew 857 (=x0359) PC-ASCII Turkish 858 (=x035A) PC-ASCII Western European with Euro 860 (=x035C) PC-ASCII Portuguese 861 (=x035D) PC-ASCII Icelandic 862 (=x035E) PC-ASCII Hebrew 863 (=x035F) PC-ASCII Canadian French 864 (=x0360) PC-ASCII Arabic 865 (=x0361) PC-ASCII Scandinavian 866 (=x0362) PC-ASCII Cyrillic #2 868 (=x0364) PC-ASCII Urdu 869 (=x0365) PC-ASCII Greek 870 (=x0366) EBCDIC Eastern Europe 871 (=x0367) EBCDIC Icelandic 872 (=x0368) PC-ASCII Cyrillic with Euro 874 (=x036A) PC-ASCII Thai SBCS 875 (=x036B) EBCDIC Greek 880 (=x0370) EBCDIC Cyrillic 891 (=x037B) IBM-891: Korean 897 (=x0381) PC-ASCII Japan Data SBCS 903 (=x0387) PC Simplified Chinese SBCS 904 (=x0388) PC Traditional Chinese Data - SBCS 912 (=x0390) ISO8859-2 Eastern European 915 (=x0393) ISO8859-5 Cyrillic 916 (=x0394) ISO8859-8 Hebrew 918 (=x0396) EBCDIC Urdu 920 (=x0398) ISO8859-9 Turkish 921 (=x0399) ISO Baltic 922 (=x039A) ISO Estonian 923 (=x039B) ISO8859-15 Western Europe with euro (Latin 9) 924 (=x039C) EBCDIC Western Europe with euro 927 (=x039F) PC Traditional Chinese DBCS 928 (=x03A0) PC Simplified 
Chinese DBCS 930 (=x03A2) EBCDIC Japanese Katakana/Kanji mixed 932 (=x03A4) Japanese OS/2 933 (=x03A5) EBCDIC Korean Mixed 935 (=x03A7) EBCDIC Simplified Chinese Mixed 937 (=x03A9) EBCDIC Traditional Chinese Mixed 939 (=x03AB) EBCDIC Japanese Latin/Kanji mixed 941 (=x03AD) Japanese PC DBCS - for open systems 942 (=x03AE) Japanese PC Data Mixed - extended SBCS 943 (=x03AF) Japanese PC Mixed - for open systems 944 (=x03BO) Korean PC data Mixed - extended SBCS 946 (=x03B2) Simplified Chinese PC data Mixed - extended SBCS 947 (=x03B3) PC Traditional Chinese DBCS 948 (=x03B4) PC Traditional Chinese Mixed - extended SBCS 949 (=x03B5) PC Korean Mixed - KS code 950 (=x03B6) PC Traditional Chinese Mixed - big5 951 (=x03B7) PC Korean DBCS - KS code 970 (=x03CA) euc Korean 1004 (=x03EC) PC Data Latin1 1006 (=x03EE) ISO Urdu 1008 (=x03F0) ASCII Arabic 8-bit ISO 1025 (=x0401) EBCDIC Cyrillic 1026 (=x0402) EBCDIC Turkish 1027 (=x0403) EBCDIC Japanese Latin 1040 (=x0410) IBM-1040: Korean 1041 (=x0411) Japanese PC - extended SBCS 1042 (=x0412) PC Simplified Chinese - extended SBCS 1043 (=x0413) PC Traditional Chinese - extended SBCS 1046 (=x0416) PC-ASCII Arabic 1047 (=x0417) IBM-1047: Western European 1051 (=x041B) ASCII roman8 for HP Western European 1088 (=x0440) PC Korean SBCS - KS code 1089 (=x0441) ISO8859-6 Arabic 1097 (=x0449) EBCDIC Farsi 1098 (=x044A) PC-ASCII Farsi 1112 (=x0458) EBCDIC Baltic (Latvian/Lithuanian) 1114 (=x045A) PC Traditional Chinese - big 5 SBCS 1115 (=x045B) PC Simplified Chinese SBCS 1122 (=x0462) EBCDIC Estonian 1123 (=x0463) EBCDIC Ukrainian 1124 (=x0464) UNIX-ASCII Ukrainian 1131 (=x046B) PC-ASCII Belarus 1140 (=x0474) EBCDIC USA, with euro (like 037) 1141 (=x0475) EBCDIC Austria, Germany, with euro (like 273) 1142 (=x0476) EBCDIC Denmark, Norway, with euro (like 277) 1143 (=x0477) EBCDIC Finland, Sweden, with euro (like 278) 1144 (=x0478) EBCDIC Italy, with euro (like 280) 1145 (=x0479) EBCDIC Spain, with euro (like 284) 1146 (=x047A) EBCDIC UK, 
with euro (like 285) 1147 (=x047B) EBCDIC France, with euro (like 297) 1148 (=x047C) EBCDIC International, with euro (like 500) 1149 (=x047D) EBCDIC Iceland, with euro (like 871) 1200 (=x04B0) Unicode - UCS-2 1208 (=x04B8) Unicode - UTF-8 1250 (=x04E2) Windows - Eastern European 1251 (=x04E3) Windows - Cyrillic 1252 (=x04E4) Windows - Western European 1253 (=x04E5) Windows - Greek 1254 (=x04E6) Windows - Turkish 1255 (=x04E7) Windows - Hebrew 1256 (=x04E8) Windows - Arabic 1257 (=x04E9) Windows - Baltic Rim 1275 (=x04FB) Apple - Western European 1280 (=x0500) Apple - Greek 1281 (=x0501) Apple - Turkish 1282 (=x0502) Apple - Eastern European 1283 (=x0503) Apple - Cyrillic 1284 (=x0504) IBM-504: Eastern European 1285 (=x0505) IBM-505: Eastern European 1363 (=x0553) Windows Korean PC Mixed including 11,172 full hangul 1364 (=x0554) Korean Host Mixed extended including 11,172 full hangul 1380 (=x0564) PC Simplified Chinese DBCS 1381 (=x0565) PC Simplified Chinese Mixed 1383 (=x0567) euc Simplified Chinese Mixed 1386 (=x056A) PC Simplified Chinese Data GBK Mixed 1388 (=x056C) DBCS Host Simplified Chinese Data GBK Mixed 5346 (=x14E2) Windows-Eastern European with Euro (like 1250) 5347 (=x14E3) Windows - Cyrillic with Euro (like 1251) 5348 (=x14E4) Windows-Western European with Euro (like 1252) 5349 (=x14E5) Windows-Windows - Greek with Euro (like 1253) 5350 (=x14E6) Windows - Turkish with Euro (like 1254) 5351 (=x14E7) Windows - Hebrew with Euro (like 1255) 5352 (=x14E8) Windows - Arabic with Euro (like 1256) 5353 (=x14E9) Windows - Baltic Rim with Euro (like 1257) 5354 (=x14EA) 'Windows - Vietnamese with Euro (like 1258)

 

 

维基百科 写道
Shift Out and Shift In characters
From Wikipedia, the free encyclopedia
Jump to: navigation, search
Shift Out (SO) and Shift In (SI) are ASCII control characters 14 and 15, respectively (0xE and 0xF). The original meaning of those characters was to switch to a different character set and back. This was used, for instance, in the Russian character set known as KOI7, where SO starts printing Russian letters, and SI starts printing Latin letters again. SO/SI control characters also are used to display VT-100 pseudographics. ISO/IEC 2022 standard specifies their generalized usage.

Some older printers used these characters to control special features, such as changing the font or ink color. On the Model 38 Teletype, SO switched to red printing while SI switched back to black printing.

 

 -- 2010-10-22更新

今天测试又有点问题,原来是中文全角空格的问题,中文全角空格不包括在"\u4e00-\u9fa5"内,所以把检验中文的正则表达式改成了

Java代码
  1. ([\u4e00-\u9fa5]|[\u3000])+  

如果觉得我的文章对您有用,请点赞。您的支持将鼓励我继续创作!

0

添加新评论0 条评论

Ctrl+Enter 发表

作者其他文章

X社区推广