unicode.c 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. /*
  2. * unicode.c
  3. *
  4. * PURPOSE
  5. * Routines for converting between UTF-8 and OSTA Compressed Unicode.
  6. * Also handles filename mangling
  7. *
  8. * DESCRIPTION
  9. * OSTA Compressed Unicode is explained in the OSTA UDF specification.
  10. * http://www.osta.org/
  11. * UTF-8 is explained in IETF RFC 3629.
  12. * https://www.rfc-editor.org/rfc/rfc3629.txt
  13. *
  14. * COPYRIGHT
  15. * This file is distributed under the terms of the GNU General Public
  16. * License (GPL). Copies of the GPL can be obtained from:
  17. * ftp://prep.ai.mit.edu/pub/gnu/GPL
  18. * Each contributing author retains all rights to their own work.
  19. */
  20. #include "udfdecl.h"
  21. #include <linux/kernel.h>
  22. #include <linux/string.h> /* for memset */
  23. #include <linux/nls.h>
  24. #include <linux/crc-itu-t.h>
  25. #include <linux/slab.h>
  26. #include "udf_sb.h"
  /*
   * Forward declaration: the definition lives at the bottom of this file,
   * after the mangling constants it relies on.
   */
  static int udf_translate_to_linux(uint8_t *newName, int newLen,
                                    uint8_t *udfName, int udfLen,
                                    uint8_t *fidName, int fidNameLen);
  29. static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
  30. {
  31. if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN))
  32. return 0;
  33. memset(dest, 0, sizeof(struct ustr));
  34. memcpy(dest->u_name, src, strlen);
  35. dest->u_cmpID = 0x08;
  36. dest->u_len = strlen;
  37. return strlen;
  38. }
  39. /*
  40. * udf_build_ustr
  41. */
  42. int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
  43. {
  44. int usesize;
  45. if (!dest || !ptr || !size)
  46. return -1;
  47. BUG_ON(size < 2);
  48. usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
  49. usesize = min(usesize, size - 2);
  50. dest->u_cmpID = ptr[0];
  51. dest->u_len = usesize;
  52. memcpy(dest->u_name, ptr + 1, usesize);
  53. memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
  54. return 0;
  55. }
  56. /*
  57. * udf_build_ustr_exact
  58. */
  59. static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
  60. {
  61. memset(dest, 0, sizeof(struct ustr));
  62. dest->u_cmpID = ptr[0];
  63. dest->u_len = exactsize - 1;
  64. memcpy(dest->u_name, ptr + 1, exactsize - 1);
  65. }
  /*
   * udf_uni2char_utf8
   *
   * Encode one character as a 1-, 2- or 3-byte UTF-8 sequence in 'out'.
   *
   * Returns the number of bytes written, or -ENAMETOOLONG when 'boundlen'
   * (the space left in 'out') is too small for the sequence.
   */
  static int udf_uni2char_utf8(wchar_t uni,
                               unsigned char *out,
                               int boundlen)
  {
      int need;

      /* Work out the sequence length before touching the output buffer. */
      if (uni < 0x80)
          need = 1;
      else if (uni < 0x800)
          need = 2;
      else
          need = 3;

      if (boundlen < need)
          return -ENAMETOOLONG;

      switch (need) {
      case 1:
          out[0] = (unsigned char)uni;
          break;
      case 2:
          out[0] = (unsigned char)(0xc0 | (uni >> 6));
          out[1] = (unsigned char)(0x80 | (uni & 0x3f));
          break;
      default:
          out[0] = (unsigned char)(0xe0 | (uni >> 12));
          out[1] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
          out[2] = (unsigned char)(0x80 | (uni & 0x3f));
          break;
      }

      return need;
  }
  /*
   * udf_char2uni_utf8
   *
   * Decode one UTF-8 sequence from 'in' (at most 'boundlen' bytes) and
   * store the resulting character in '*uni'.
   *
   * Returns the number of input bytes consumed on success. When 'boundlen'
   * is not positive nothing is read, '*uni' is left untouched and 0 is
   * returned. On a malformed sequence (invalid lead byte, a continuation
   * byte that is not of the form 10xxxxxx, or input that ends in the middle
   * of a sequence) '*uni' is set to '?' and -EINVAL is returned.
   *
   * NOTE(review): the decoded value is assigned to a wchar_t; if wchar_t is
   * narrower than the decoded code point the value is truncated - confirm
   * the wchar_t width used by this build.
   */
  static int udf_char2uni_utf8(const unsigned char *in,
                               int boundlen,
                               wchar_t *uni)
  {
      unsigned int utf_char = 0;
      unsigned char c;
      int utf_cnt = 0;    /* continuation bytes still expected */
      int u_len;

      for (u_len = 0; u_len < boundlen;) {
          c = in[u_len++];

          if (utf_cnt) {
              /*
               * Inside a multi-byte character: only 10xxxxxx bytes
               * are legal here. Anything else means the sequence
               * is broken, so report it instead of folding the
               * stray byte into the character.
               */
              if ((c & 0xc0) != 0x80) {
                  utf_cnt = -1;
                  break;
              }
              utf_char = (utf_char << 6) | (c & 0x3f);
              if (--utf_cnt)
                  continue;
          } else if (c & 0x80) {
              /* Lead byte of a multi-byte character */
              if ((c & 0xe0) == 0xc0) {
                  utf_char = c & 0x1f;
                  utf_cnt = 1;
              } else if ((c & 0xf0) == 0xe0) {
                  utf_char = c & 0x0f;
                  utf_cnt = 2;
              } else if ((c & 0xf8) == 0xf0) {
                  utf_char = c & 0x07;
                  utf_cnt = 3;
              } else if ((c & 0xfc) == 0xf8) {
                  utf_char = c & 0x03;
                  utf_cnt = 4;
              } else if ((c & 0xfe) == 0xfc) {
                  utf_char = c & 0x01;
                  utf_cnt = 5;
              } else {
                  /* Stray continuation byte or 0xfe/0xff */
                  utf_cnt = -1;
                  break;
              }
              continue;
          } else {
              /* Single byte character (most common) */
              utf_char = c;
          }

          /* A complete character has been assembled. */
          *uni = utf_char;
          break;
      }

      /* Non-zero here means invalid or truncated input. */
      if (utf_cnt) {
          *uni = '?';
          return -EINVAL;
      }
      return u_len;
  }
  143. static int udf_name_from_CS0(struct ustr *utf_o,
  144. const struct ustr *ocu_i,
  145. int (*conv_f)(wchar_t, unsigned char *, int))
  146. {
  147. const uint8_t *ocu;
  148. uint8_t cmp_id, ocu_len;
  149. int i, len;
  150. ocu_len = ocu_i->u_len;
  151. if (ocu_len == 0) {
  152. memset(utf_o, 0, sizeof(struct ustr));
  153. return 0;
  154. }
  155. cmp_id = ocu_i->u_cmpID;
  156. if (cmp_id != 8 && cmp_id != 16) {
  157. memset(utf_o, 0, sizeof(struct ustr));
  158. pr_err("unknown compression code (%d) stri=%s\n",
  159. cmp_id, ocu_i->u_name);
  160. return -EINVAL;
  161. }
  162. ocu = ocu_i->u_name;
  163. utf_o->u_len = 0;
  164. for (i = 0; (i < ocu_len) && (utf_o->u_len < UDF_NAME_LEN);) {
  165. /* Expand OSTA compressed Unicode to Unicode */
  166. uint32_t c = ocu[i++];
  167. if (cmp_id == 16)
  168. c = (c << 8) | ocu[i++];
  169. len = conv_f(c, &utf_o->u_name[utf_o->u_len],
  170. UDF_NAME_LEN - utf_o->u_len);
  171. /* Valid character? */
  172. if (len >= 0)
  173. utf_o->u_len += len;
  174. else if (len == -ENAMETOOLONG)
  175. break;
  176. else
  177. utf_o->u_name[utf_o->u_len++] = '?';
  178. }
  179. utf_o->u_cmpID = 8;
  180. return utf_o->u_len;
  181. }
  182. static int udf_name_to_CS0(dstring *ocu, struct ustr *uni, int length,
  183. int (*conv_f)(const unsigned char *, int, wchar_t *))
  184. {
  185. int i, len;
  186. unsigned int max_val;
  187. wchar_t uni_char;
  188. int u_len, u_ch;
  189. memset(ocu, 0, sizeof(dstring) * length);
  190. ocu[0] = 8;
  191. max_val = 0xff;
  192. u_ch = 1;
  193. try_again:
  194. u_len = 0;
  195. for (i = 0; i < uni->u_len; i++) {
  196. /* Name didn't fit? */
  197. if (u_len + 1 + u_ch >= length)
  198. return 0;
  199. len = conv_f(&uni->u_name[i], uni->u_len - i, &uni_char);
  200. if (!len)
  201. continue;
  202. /* Invalid character, deal with it */
  203. if (len < 0) {
  204. len = 1;
  205. uni_char = '?';
  206. }
  207. if (uni_char > max_val) {
  208. max_val = 0xffff;
  209. ocu[0] = 0x10;
  210. u_ch = 2;
  211. goto try_again;
  212. }
  213. if (max_val == 0xffff)
  214. ocu[++u_len] = (uint8_t)(uni_char >> 8);
  215. ocu[++u_len] = (uint8_t)(uni_char & 0xff);
  216. i += len - 1;
  217. }
  218. ocu[length - 1] = (uint8_t)u_len + 1;
  219. return u_len + 1;
  220. }
  221. int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
  222. {
  223. return udf_name_from_CS0(utf_o, ocu_i, udf_uni2char_utf8);
  224. }
  225. int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
  226. uint8_t *dname, int dlen)
  227. {
  228. struct ustr *filename, *unifilename;
  229. int (*conv_f)(wchar_t, unsigned char *, int);
  230. int ret;
  231. if (!slen)
  232. return -EIO;
  233. filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
  234. if (!filename)
  235. return -ENOMEM;
  236. unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
  237. if (!unifilename) {
  238. ret = -ENOMEM;
  239. goto out1;
  240. }
  241. udf_build_ustr_exact(unifilename, sname, slen);
  242. if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
  243. conv_f = udf_uni2char_utf8;
  244. } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
  245. conv_f = UDF_SB(sb)->s_nls_map->uni2char;
  246. } else
  247. BUG();
  248. ret = udf_name_from_CS0(filename, unifilename, conv_f);
  249. if (ret < 0) {
  250. udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
  251. goto out2;
  252. }
  253. ret = udf_translate_to_linux(dname, dlen,
  254. filename->u_name, filename->u_len,
  255. unifilename->u_name, unifilename->u_len);
  256. /* Zero length filename isn't valid... */
  257. if (ret == 0)
  258. ret = -EINVAL;
  259. out2:
  260. kfree(unifilename);
  261. out1:
  262. kfree(filename);
  263. return ret;
  264. }
  265. int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
  266. uint8_t *dname, int dlen)
  267. {
  268. struct ustr unifilename;
  269. int (*conv_f)(const unsigned char *, int, wchar_t *);
  270. if (!udf_char_to_ustr(&unifilename, sname, slen))
  271. return 0;
  272. if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
  273. conv_f = udf_char2uni_utf8;
  274. } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
  275. conv_f = UDF_SB(sb)->s_nls_map->char2uni;
  276. } else
  277. BUG();
  278. return udf_name_to_CS0(dname, &unifilename, dlen, conv_f);
  279. }
  /* Constants used by udf_translate_to_linux() when mangling names. */

  /* Replaces a run of characters that cannot appear in a Linux name */
  #define ILLEGAL_CHAR_MARK   '_'
  /* Separates the base name from the extension */
  #define EXT_MARK            '.'
  /* Introduces the CRC inserted into a mangled name */
  #define CRC_MARK            '#'
  /* Maximum number of characters treated as an extension */
  #define EXT_SIZE            5
  /* Number of chars we need to store generated CRC to make filename unique */
  #define CRC_LEN             5
  286. static int udf_translate_to_linux(uint8_t *newName, int newLen,
  287. uint8_t *udfName, int udfLen,
  288. uint8_t *fidName, int fidNameLen)
  289. {
  290. int index, newIndex = 0, needsCRC = 0;
  291. int extIndex = 0, newExtIndex = 0, hasExt = 0;
  292. unsigned short valueCRC;
  293. uint8_t curr;
  294. if (udfName[0] == '.' &&
  295. (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
  296. needsCRC = 1;
  297. newIndex = udfLen;
  298. memcpy(newName, udfName, udfLen);
  299. } else {
  300. for (index = 0; index < udfLen; index++) {
  301. curr = udfName[index];
  302. if (curr == '/' || curr == 0) {
  303. needsCRC = 1;
  304. curr = ILLEGAL_CHAR_MARK;
  305. while (index + 1 < udfLen &&
  306. (udfName[index + 1] == '/' ||
  307. udfName[index + 1] == 0))
  308. index++;
  309. }
  310. if (curr == EXT_MARK &&
  311. (udfLen - index - 1) <= EXT_SIZE) {
  312. if (udfLen == index + 1)
  313. hasExt = 0;
  314. else {
  315. hasExt = 1;
  316. extIndex = index;
  317. newExtIndex = newIndex;
  318. }
  319. }
  320. if (newIndex < newLen)
  321. newName[newIndex++] = curr;
  322. else
  323. needsCRC = 1;
  324. }
  325. }
  326. if (needsCRC) {
  327. uint8_t ext[EXT_SIZE];
  328. int localExtIndex = 0;
  329. if (hasExt) {
  330. int maxFilenameLen;
  331. for (index = 0;
  332. index < EXT_SIZE && extIndex + index + 1 < udfLen;
  333. index++) {
  334. curr = udfName[extIndex + index + 1];
  335. if (curr == '/' || curr == 0) {
  336. needsCRC = 1;
  337. curr = ILLEGAL_CHAR_MARK;
  338. while (extIndex + index + 2 < udfLen &&
  339. (index + 1 < EXT_SIZE &&
  340. (udfName[extIndex + index + 2] == '/' ||
  341. udfName[extIndex + index + 2] == 0)))
  342. index++;
  343. }
  344. ext[localExtIndex++] = curr;
  345. }
  346. maxFilenameLen = newLen - CRC_LEN - localExtIndex;
  347. if (newIndex > maxFilenameLen)
  348. newIndex = maxFilenameLen;
  349. else
  350. newIndex = newExtIndex;
  351. } else if (newIndex > newLen - CRC_LEN)
  352. newIndex = newLen - CRC_LEN;
  353. newName[newIndex++] = CRC_MARK;
  354. valueCRC = crc_itu_t(0, fidName, fidNameLen);
  355. newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
  356. newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
  357. newName[newIndex++] = hex_asc_upper_hi(valueCRC);
  358. newName[newIndex++] = hex_asc_upper_lo(valueCRC);
  359. if (hasExt) {
  360. newName[newIndex++] = EXT_MARK;
  361. for (index = 0; index < localExtIndex; index++)
  362. newName[newIndex++] = ext[index];
  363. }
  364. }
  365. return newIndex;
  366. }