Ruby 2.1.10p492 (2016-04-01 revision 54464)
transcode.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  transcode.c -
4 
5  $Author: usa $
6  created at: Tue Oct 30 16:10:22 JST 2007
7 
8  Copyright (C) 2007 Martin Duerst
9 
10 **********************************************************************/
11 
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "internal.h"
15 #include "transcode_data.h"
16 #include <ctype.h>
17 
18 #define ENABLE_ECONV_NEWLINE_OPTION 1
19 
20 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
24 
26 
32 #ifdef ENABLE_ECONV_NEWLINE_OPTION
34 #endif
36 
44 
45 static unsigned char *
46 allocate_converted_string(const char *sname, const char *dname,
47  const unsigned char *str, size_t len,
48  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
49  size_t *dst_len_ptr);
50 
51 /* dynamic structure, one per conversion (similar to iconv_t) */
52 /* may carry conversion state (e.g. for iso-2022-jp) */
53 typedef struct rb_transcoding {
55 
56  int flags;
57 
59  unsigned int next_table;
61  unsigned char next_byte;
62  unsigned int output_index;
63 
64  ssize_t recognized_len; /* already interpreted */
65  ssize_t readagain_len; /* not yet interpreted */
66  union {
67  unsigned char ary[8]; /* max_input <= sizeof(ary) */
68  unsigned char *ptr; /* length: max_input */
69  } readbuf; /* recognized_len + readagain_len used */
70 
71  ssize_t writebuf_off;
72  ssize_t writebuf_len;
73  union {
74  unsigned char ary[8]; /* max_output <= sizeof(ary) */
75  unsigned char *ptr; /* length: max_output */
76  } writebuf;
77 
78  union rb_transcoding_state_t { /* opaque data for stateful encoding */
79  void *ptr;
80  char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
82  } state;
/*
 * Accessors for the small-buffer unions of rb_transcoding.
 *
 * TRANSCODING_READBUF(tc): yields readbuf.ary when the transcoder's
 * max_input fits in that embedded array, otherwise readbuf.ptr.
 */
84 #define TRANSCODING_READBUF(tc) \
85  ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
86  (tc)->readbuf.ary : \
87  (tc)->readbuf.ptr)
/*
 * TRANSCODING_WRITEBUF(tc): same selection for the write side, keyed on
 * the transcoder's max_output versus sizeof(writebuf.ary).
 */
88 #define TRANSCODING_WRITEBUF(tc) \
89  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
90  (tc)->writebuf.ary : \
91  (tc)->writebuf.ptr)
/*
 * TRANSCODING_WRITEBUF_SIZE(tc): size that goes with TRANSCODING_WRITEBUF:
 * sizeof(writebuf.ary) when the embedded array is selected, otherwise
 * max_output converted to size_t.
 */
92 #define TRANSCODING_WRITEBUF_SIZE(tc) \
93  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
94  sizeof((tc)->writebuf.ary) : \
95  (size_t)(tc)->transcoder->max_output)
/* Size of union rb_transcoding_state_t, as an int. */
96 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/*
 * TRANSCODING_STATE(tc): yields state.ary when the transcoder's state_size
 * fits in the state union, otherwise state.ptr.
 */
97 #define TRANSCODING_STATE(tc) \
98  ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
99  (tc)->state.ary : \
100  (tc)->state.ptr)
101 
102 typedef struct {
104  unsigned char *out_buf_start;
105  unsigned char *out_data_start;
106  unsigned char *out_data_end;
107  unsigned char *out_buf_end;
110 
111 struct rb_econv_t {
112  int flags;
113  const char *source_encoding_name;
115 
116  int started;
117 
118  const unsigned char *replacement_str;
120  const char *replacement_enc;
122 
123  unsigned char *in_buf_start;
124  unsigned char *in_data_start;
125  unsigned char *in_data_end;
126  unsigned char *in_buf_end;
132 
133  /* last error */
134  struct {
137  const char *source_encoding;
138  const char *destination_encoding;
139  const unsigned char *error_bytes_start;
142  } last_error;
143 
144  /* The following fields are only for Encoding::Converter.
145  * rb_econv_open sets them to NULL. */
148 };
149 
150 /*
151  * Dispatch data and logic
152  */
153 
154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
155 
156 typedef struct {
157  const char *sname;
158  const char *dname;
159  const char *lib; /* null means no need to load a library */
162 
164 
165 static transcoder_entry_t *
166 make_transcoder_entry(const char *sname, const char *dname)
167 {
168  st_data_t val;
169  st_table *table2;
170 
171  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
174  }
175  table2 = (st_table *)val;
176  if (!st_lookup(table2, (st_data_t)dname, &val)) {
178  entry->sname = sname;
179  entry->dname = dname;
180  entry->lib = NULL;
181  entry->transcoder = NULL;
182  val = (st_data_t)entry;
183  st_add_direct(table2, (st_data_t)dname, val);
184  }
185  return (transcoder_entry_t *)val;
186 }
187 
188 static transcoder_entry_t *
189 get_transcoder_entry(const char *sname, const char *dname)
190 {
191  st_data_t val;
192  st_table *table2;
193 
194  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
195  return NULL;
196  }
197  table2 = (st_table *)val;
198  if (!st_lookup(table2, (st_data_t)dname, &val)) {
199  return NULL;
200  }
201  return (transcoder_entry_t *)val;
202 }
203 
204 void
206 {
207  const char *const sname = tr->src_encoding;
208  const char *const dname = tr->dst_encoding;
209 
210  transcoder_entry_t *entry;
211 
212  entry = make_transcoder_entry(sname, dname);
213  if (entry->transcoder) {
214  rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
215  sname, dname);
216  }
217 
218  entry->transcoder = tr;
219 }
220 
221 static void
222 declare_transcoder(const char *sname, const char *dname, const char *lib)
223 {
224  transcoder_entry_t *entry;
225 
226  entry = make_transcoder_entry(sname, dname);
227  entry->lib = lib;
228 }
229 
230 static const char transcoder_lib_prefix[] = "enc/trans/";
231 
232 void
233 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
234 {
235  if (!lib) {
236  rb_raise(rb_eArgError, "invalid library name - (null)");
237  }
238  declare_transcoder(enc1, enc2, lib);
239 }
240 
241 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
242 
243 typedef struct search_path_queue_tag {
245  const char *enc;
247 
248 typedef struct {
252  const char *base_enc;
254 
255 static int
257 {
258  const char *dname = (const char *)key;
259  search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
261 
262  if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
263  return ST_CONTINUE;
264  }
265 
267  q->enc = dname;
268  q->next = NULL;
269  *bfs->queue_last_ptr = q;
270  bfs->queue_last_ptr = &q->next;
271 
272  st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
273  return ST_CONTINUE;
274 }
275 
276 static int
277 transcode_search_path(const char *sname, const char *dname,
278  void (*callback)(const char *sname, const char *dname, int depth, void *arg),
279  void *arg)
280 {
281  search_path_bfs_t bfs;
283  st_data_t val;
284  st_table *table2;
285  int found;
286  int pathlen = -1;
287 
288  if (encoding_equal(sname, dname))
289  return -1;
290 
292  q->enc = sname;
293  q->next = NULL;
294  bfs.queue_last_ptr = &q->next;
295  bfs.queue = q;
296 
299 
300  while (bfs.queue) {
301  q = bfs.queue;
302  bfs.queue = q->next;
303  if (!bfs.queue)
304  bfs.queue_last_ptr = &bfs.queue;
305 
306  if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
307  xfree(q);
308  continue;
309  }
310  table2 = (st_table *)val;
311 
312  if (st_lookup(table2, (st_data_t)dname, &val)) {
313  st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
314  xfree(q);
315  found = 1;
316  goto cleanup;
317  }
318 
319  bfs.base_enc = q->enc;
321  bfs.base_enc = NULL;
322 
323  xfree(q);
324  }
325  found = 0;
326 
327  cleanup:
328  while (bfs.queue) {
329  q = bfs.queue;
330  bfs.queue = q->next;
331  xfree(q);
332  }
333 
334  if (found) {
335  const char *enc = dname;
336  int depth;
337  pathlen = 0;
338  while (1) {
339  st_lookup(bfs.visited, (st_data_t)enc, &val);
340  if (!val)
341  break;
342  pathlen++;
343  enc = (const char *)val;
344  }
345  depth = pathlen;
346  enc = dname;
347  while (1) {
348  st_lookup(bfs.visited, (st_data_t)enc, &val);
349  if (!val)
350  break;
351  callback((const char *)val, enc, --depth, arg);
352  enc = (const char *)val;
353  }
354  }
355 
356  st_free_table(bfs.visited);
357 
358  return pathlen; /* is -1 if not found */
359 }
360 
361 static const rb_transcoder *
363 {
364  if (entry->transcoder)
365  return entry->transcoder;
366 
367  if (entry->lib) {
368  const char *const lib = entry->lib;
369  const size_t len = strlen(lib);
370  const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
371  const VALUE fn = rb_str_new(0, total_len);
372  char *const path = RSTRING_PTR(fn);
373  const int safe = rb_safe_level();
374 
375  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
376  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
377  rb_str_set_len(fn, total_len);
378  FL_UNSET(fn, FL_TAINT);
379  OBJ_FREEZE(fn);
380  rb_require_safe(fn, safe > 3 ? 3 : safe);
381  }
382 
383  if (entry->transcoder)
384  return entry->transcoder;
385 
386  return NULL;
387 }
388 
/*
 * Choose the default replacement string for a destination encoding.
 *
 * encname:          encoding name, compared with encoding_equal().
 * len_ret:          receives the byte length of the returned string.
 * repl_encname_ptr: receives the name of the encoding the returned
 *                   bytes are expressed in.
 *
 * For "UTF-8" the three bytes EF BF BD (U+FFFD) are returned and reported
 * as "UTF-8"; for every other name a single "?" reported as "US-ASCII".
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
403 
404 /*
405  * Transcoding engine logic
406  */
407 
408 static const unsigned char *
410  const unsigned char *in_start,
411  const unsigned char *inchar_start,
412  const unsigned char *in_p,
413  size_t *char_len_ptr)
414 {
415  const unsigned char *ptr;
416  if (inchar_start - in_start < tc->recognized_len) {
418  inchar_start, unsigned char, in_p - inchar_start);
419  ptr = TRANSCODING_READBUF(tc);
420  }
421  else {
422  ptr = inchar_start - tc->recognized_len;
423  }
424  *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
425  return ptr;
426 }
427 
428 static rb_econv_result_t
429 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
430  const unsigned char *in_stop, unsigned char *out_stop,
431  rb_transcoding *tc,
432  const int opt)
433 {
434  const rb_transcoder *tr = tc->transcoder;
435  int unitlen = tr->input_unit_length;
436  ssize_t readagain_len = 0;
437 
438  const unsigned char *inchar_start;
439  const unsigned char *in_p;
440 
441  unsigned char *out_p;
442 
443  in_p = inchar_start = *in_pos;
444 
445  out_p = *out_pos;
446 
447 #define SUSPEND(ret, num) \
448  do { \
449  tc->resume_position = (num); \
450  if (0 < in_p - inchar_start) \
451  MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
452  inchar_start, unsigned char, in_p - inchar_start); \
453  *in_pos = in_p; \
454  *out_pos = out_p; \
455  tc->recognized_len += in_p - inchar_start; \
456  if (readagain_len) { \
457  tc->recognized_len -= readagain_len; \
458  tc->readagain_len = readagain_len; \
459  } \
460  return (ret); \
461  resume_label ## num:; \
462  } while (0)
463 #define SUSPEND_OBUF(num) \
464  do { \
465  while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
466  } while (0)
467 
468 #define SUSPEND_AFTER_OUTPUT(num) \
469  if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
470  SUSPEND(econv_after_output, num); \
471  }
472 
473 #define next_table (tc->next_table)
474 #define next_info (tc->next_info)
475 #define next_byte (tc->next_byte)
476 #define writebuf_len (tc->writebuf_len)
477 #define writebuf_off (tc->writebuf_off)
478 
479  switch (tc->resume_position) {
480  case 0: break;
481  case 1: goto resume_label1;
482  case 2: goto resume_label2;
483  case 3: goto resume_label3;
484  case 4: goto resume_label4;
485  case 5: goto resume_label5;
486  case 6: goto resume_label6;
487  case 7: goto resume_label7;
488  case 8: goto resume_label8;
489  case 9: goto resume_label9;
490  case 10: goto resume_label10;
491  case 11: goto resume_label11;
492  case 12: goto resume_label12;
493  case 13: goto resume_label13;
494  case 14: goto resume_label14;
495  case 15: goto resume_label15;
496  case 16: goto resume_label16;
497  case 17: goto resume_label17;
498  case 18: goto resume_label18;
499  case 19: goto resume_label19;
500  case 20: goto resume_label20;
501  case 21: goto resume_label21;
502  case 22: goto resume_label22;
503  case 23: goto resume_label23;
504  case 24: goto resume_label24;
505  case 25: goto resume_label25;
506  case 26: goto resume_label26;
507  case 27: goto resume_label27;
508  case 28: goto resume_label28;
509  case 29: goto resume_label29;
510  case 30: goto resume_label30;
511  case 31: goto resume_label31;
512  case 32: goto resume_label32;
513  case 33: goto resume_label33;
514  case 34: goto resume_label34;
515  }
516 
517  while (1) {
518  inchar_start = in_p;
519  tc->recognized_len = 0;
520  next_table = tr->conv_tree_start;
521 
523 
524  if (in_stop <= in_p) {
525  if (!(opt & ECONV_PARTIAL_INPUT))
526  break;
528  continue;
529  }
530 
531 #define BYTE_ADDR(index) (tr->byte_array + (index))
532 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
533 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
534 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
535 #define BL_MIN_BYTE (BL_BASE[0])
536 #define BL_MAX_BYTE (BL_BASE[1])
537 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
538 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
539 
540  next_byte = (unsigned char)*in_p++;
541  follow_byte:
543  next_info = INVALID;
544  else {
546  }
547  follow_info:
548  switch (next_info & 0x1F) {
549  case NOMAP:
550  {
551  const unsigned char *p = inchar_start;
552  writebuf_off = 0;
553  while (p < in_p) {
554  TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
555  }
557  writebuf_off = 0;
558  while (writebuf_off < writebuf_len) {
559  SUSPEND_OBUF(3);
560  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
561  }
562  }
563  continue;
564  case 0x00: case 0x04: case 0x08: case 0x0C:
565  case 0x10: case 0x14: case 0x18: case 0x1C:
567  while (in_p >= in_stop) {
568  if (!(opt & ECONV_PARTIAL_INPUT))
569  goto incomplete;
571  }
572  next_byte = (unsigned char)*in_p++;
573  next_table = (unsigned int)next_info;
574  goto follow_byte;
575  case ZERObt: /* drop input */
576  continue;
577  case ONEbt:
578  SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
579  continue;
580  case TWObt:
581  SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
582  SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
583  continue;
584  case THREEbt:
585  SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
586  SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
587  SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
588  continue;
589  case FOURbt:
590  SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
591  SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
592  SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
593  SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
594  continue;
595  case GB4bt:
596  SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
597  SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
598  SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
599  SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
600  continue;
601  case STR1:
602  tc->output_index = 0;
605  tc->output_index++;
606  }
607  continue;
608  case FUNii:
609  next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
610  goto follow_info;
611  case FUNsi:
612  {
613  const unsigned char *char_start;
614  size_t char_len;
615  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
616  next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
617  goto follow_info;
618  }
619  case FUNio:
620  SUSPEND_OBUF(13);
621  if (tr->max_output <= out_stop - out_p)
622  out_p += tr->func_io(TRANSCODING_STATE(tc),
623  next_info, out_p, out_stop - out_p);
624  else {
625  writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
626  next_info,
628  writebuf_off = 0;
629  while (writebuf_off < writebuf_len) {
630  SUSPEND_OBUF(20);
631  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
632  }
633  }
634  break;
635  case FUNso:
636  {
637  const unsigned char *char_start;
638  size_t char_len;
639  SUSPEND_OBUF(14);
640  if (tr->max_output <= out_stop - out_p) {
641  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
642  out_p += tr->func_so(TRANSCODING_STATE(tc),
643  char_start, (size_t)char_len,
644  out_p, out_stop - out_p);
645  }
646  else {
647  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
648  writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
649  char_start, (size_t)char_len,
651  writebuf_off = 0;
652  while (writebuf_off < writebuf_len) {
653  SUSPEND_OBUF(22);
654  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
655  }
656  }
657  break;
658  }
659  case FUNsio:
660  {
661  const unsigned char *char_start;
662  size_t char_len;
663  SUSPEND_OBUF(33);
664  if (tr->max_output <= out_stop - out_p) {
665  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
666  out_p += tr->func_sio(TRANSCODING_STATE(tc),
667  char_start, (size_t)char_len, next_info,
668  out_p, out_stop - out_p);
669  }
670  else {
671  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
672  writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
673  char_start, (size_t)char_len, next_info,
675  writebuf_off = 0;
676  while (writebuf_off < writebuf_len) {
677  SUSPEND_OBUF(34);
678  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
679  }
680  }
681  break;
682  }
683  case INVALID:
684  if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
685  if (tc->recognized_len + (in_p - inchar_start) < unitlen)
687  while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
688  in_p = in_stop;
690  }
691  if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
692  in_p = in_stop;
693  }
694  else {
695  in_p = inchar_start + (unitlen - tc->recognized_len);
696  }
697  }
698  else {
699  ssize_t invalid_len; /* including the last byte which causes invalid */
700  ssize_t discard_len;
701  invalid_len = tc->recognized_len + (in_p - inchar_start);
702  discard_len = ((invalid_len - 1) / unitlen) * unitlen;
703  readagain_len = invalid_len - discard_len;
704  }
705  goto invalid;
706  case UNDEF:
707  goto undef;
708  default:
709  rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
710  }
711  continue;
712 
713  invalid:
715  continue;
716 
717  incomplete:
719  continue;
720 
721  undef:
723  continue;
724  }
725 
726  /* cleanup */
727  if (tr->finish_func) {
728  SUSPEND_OBUF(4);
729  if (tr->max_output <= out_stop - out_p) {
730  out_p += tr->finish_func(TRANSCODING_STATE(tc),
731  out_p, out_stop - out_p);
732  }
733  else {
734  writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
736  writebuf_off = 0;
737  while (writebuf_off < writebuf_len) {
738  SUSPEND_OBUF(23);
739  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
740  }
741  }
742  }
743  while (1)
745 #undef SUSPEND
746 #undef next_table
747 #undef next_info
748 #undef next_byte
749 #undef writebuf_len
750 #undef writebuf_off
751 }
752 
/*
 * Wrapper around transcode_restartable0() that first re-feeds any bytes
 * the previous call left pending (tc->readagain_len of them, stored in
 * the read buffer after the first tc->recognized_len bytes).
 *
 * If re-feeding those bytes ends with anything other than
 * econv_source_buffer_empty, that result is returned at once; otherwise
 * conversion continues with the caller's input range.
 */
753 static rb_econv_result_t
754 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
755  const unsigned char *in_stop, unsigned char *out_stop,
756  rb_transcoding *tc,
757  const int opt)
758 {
759  if (tc->readagain_len) {
 /* Temporary copy of the pending bytes, obtained via ALLOCA_N. */
760  unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
761  const unsigned char *readagain_pos = readagain_buf;
762  const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
763  rb_econv_result_t res;
764 
765  MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
766  unsigned char, tc->readagain_len);
 /* Pending count is cleared before the bytes are fed back in. */
767  tc->readagain_len = 0;
 /* ECONV_PARTIAL_INPUT is forced on for the re-fed bytes. */
768  res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
769  if (res != econv_source_buffer_empty) {
 /* NOTE(review): the opening line of this call (listing line 770) is
  * missing from this listing; the arguments below move the bytes not
  * consumed from the temporary buffer. */
771  readagain_pos, unsigned char, readagain_stop - readagain_pos);
 /* Unconsumed bytes are counted as pending again. */
772  tc->readagain_len += readagain_stop - readagain_pos;
773  return res;
774  }
775  }
776  return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
777 }
778 
779 static rb_transcoding *
781 {
782  rb_transcoding *tc;
783 
784  tc = ALLOC(rb_transcoding);
785  tc->transcoder = tr;
786  tc->flags = flags;
787  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
788  tc->state.ptr = xmalloc(tr->state_size);
789  if (tr->state_init_func) {
790  (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
791  }
792  tc->resume_position = 0;
793  tc->recognized_len = 0;
794  tc->readagain_len = 0;
795  tc->writebuf_len = 0;
796  tc->writebuf_off = 0;
797  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
798  tc->readbuf.ptr = xmalloc(tr->max_input);
799  }
800  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
801  tc->writebuf.ptr = xmalloc(tr->max_output);
802  }
803  return tc;
804 }
805 
806 static rb_econv_result_t
808  const unsigned char **input_ptr, const unsigned char *input_stop,
809  unsigned char **output_ptr, unsigned char *output_stop,
810  int flags)
811 {
812  return transcode_restartable(
813  input_ptr, output_ptr,
814  input_stop, output_stop,
815  tc, flags);
816 }
817 
818 static void
820 {
821  const rb_transcoder *tr = tc->transcoder;
822  if (tr->state_fini_func) {
823  (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
824  }
825  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
826  xfree(tc->state.ptr);
827  if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
828  xfree(tc->readbuf.ptr);
829  if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
830  xfree(tc->writebuf.ptr);
831  xfree(tc);
832 }
833 
834 static size_t
836 {
837  size_t size = sizeof(rb_transcoding);
838  const rb_transcoder *tr = tc->transcoder;
839 
840  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
841  size += tr->state_size;
842  }
843  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
844  size += tr->max_input;
845  }
846  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
847  size += tr->max_output;
848  }
849  return size;
850 }
851 
852 static rb_econv_t *
853 rb_econv_alloc(int n_hint)
854 {
855  rb_econv_t *ec;
856 
857  if (n_hint <= 0)
858  n_hint = 1;
859 
860  ec = ALLOC(rb_econv_t);
861  ec->flags = 0;
864  ec->started = 0;
865  ec->replacement_str = NULL;
866  ec->replacement_len = 0;
867  ec->replacement_enc = NULL;
868  ec->replacement_allocated = 0;
869  ec->in_buf_start = NULL;
870  ec->in_data_start = NULL;
871  ec->in_data_end = NULL;
872  ec->in_buf_end = NULL;
873  ec->num_allocated = n_hint;
874  ec->num_trans = 0;
876  ec->num_finished = 0;
877  ec->last_tc = NULL;
879  ec->last_error.error_tc = NULL;
883  ec->last_error.error_bytes_len = 0;
884  ec->last_error.readagain_len = 0;
885  ec->source_encoding = NULL;
887  return ec;
888 }
889 
890 static int
892 {
893  int n, j;
894  int bufsize = 4096;
895  unsigned char *p;
896 
897  if (ec->num_trans == ec->num_allocated) {
898  n = ec->num_allocated * 2;
900  ec->num_allocated = n;
901  }
902 
903  p = xmalloc(bufsize);
904 
905  MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
906 
908  ec->elems[i].out_buf_start = p;
909  ec->elems[i].out_buf_end = p + bufsize;
910  ec->elems[i].out_data_start = p;
911  ec->elems[i].out_data_end = p;
913 
914  ec->num_trans++;
915 
916  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
917  for (j = ec->num_trans-1; i <= j; j--) {
918  rb_transcoding *tc = ec->elems[j].tc;
919  const rb_transcoder *tr2 = tc->transcoder;
920  if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
921  ec->last_tc = tc;
922  break;
923  }
924  }
925 
926  return 0;
927 }
928 
929 static rb_econv_t *
931 {
932  rb_econv_t *ec;
933  int i, ret;
934 
935  for (i = 0; i < n; i++) {
936  const rb_transcoder *tr;
937  tr = load_transcoder_entry(entries[i]);
938  if (!tr)
939  return NULL;
940  }
941 
942  ec = rb_econv_alloc(n);
943 
944  for (i = 0; i < n; i++) {
945  const rb_transcoder *tr = load_transcoder_entry(entries[i]);
946  ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
947  if (ret == -1) {
948  rb_econv_close(ec);
949  return NULL;
950  }
951  }
952 
953  return ec;
954 }
955 
956 struct trans_open_t {
959 };
960 
961 static void
962 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
963 {
964  struct trans_open_t *toarg = arg;
965 
966  if (!toarg->entries) {
967  toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
968  }
969  toarg->entries[depth] = get_transcoder_entry(sname, dname);
970 }
971 
972 static rb_econv_t *
973 rb_econv_open0(const char *sname, const char *dname, int ecflags)
974 {
976  int num_trans;
977  rb_econv_t *ec;
978 
979  int sidx, didx;
980 
981  if (*sname) {
982  sidx = rb_enc_find_index(sname);
983  if (0 <= sidx) {
984  rb_enc_from_index(sidx);
985  }
986  }
987 
988  if (*dname) {
989  didx = rb_enc_find_index(dname);
990  if (0 <= didx) {
991  rb_enc_from_index(didx);
992  }
993  }
994 
995  if (*sname == '\0' && *dname == '\0') {
996  num_trans = 0;
997  entries = NULL;
998  sname = dname = "";
999  }
1000  else {
1001  struct trans_open_t toarg;
1002  toarg.entries = NULL;
1003  toarg.num_additional = 0;
1004  num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1005  entries = toarg.entries;
1006  if (num_trans < 0) {
1007  xfree(entries);
1008  return NULL;
1009  }
1010  }
1011 
1013  xfree(entries);
1014  if (!ec)
1015  return NULL;
1016 
1017  ec->flags = ecflags;
1018  ec->source_encoding_name = sname;
1019  ec->destination_encoding_name = dname;
1020 
1021  return ec;
1022 }
1023 
1024 #define MAX_ECFLAGS_DECORATORS 32
1025 
1026 static int
1027 decorator_names(int ecflags, const char **decorators_ret)
1028 {
1029  int num_decorators;
1030 
1031  switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1035  case 0:
1036  break;
1037  default:
1038  return -1;
1039  }
1040 
1041  if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1043  return -1;
1044 
1045  num_decorators = 0;
1046 
1047  if (ecflags & ECONV_XML_TEXT_DECORATOR)
1048  decorators_ret[num_decorators++] = "xml_text_escape";
1049  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1050  decorators_ret[num_decorators++] = "xml_attr_content_escape";
1051  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1052  decorators_ret[num_decorators++] = "xml_attr_quote";
1053 
1054  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1055  decorators_ret[num_decorators++] = "crlf_newline";
1056  if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1057  decorators_ret[num_decorators++] = "cr_newline";
1058  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1059  decorators_ret[num_decorators++] = "universal_newline";
1060 
1061  return num_decorators;
1062 }
1063 
1064 rb_econv_t *
1065 rb_econv_open(const char *sname, const char *dname, int ecflags)
1066 {
1067  rb_econv_t *ec;
1068  int num_decorators;
1069  const char *decorators[MAX_ECFLAGS_DECORATORS];
1070  int i;
1071 
1072  num_decorators = decorator_names(ecflags, decorators);
1073  if (num_decorators == -1)
1074  return NULL;
1075 
1076  ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1077  if (!ec)
1078  return NULL;
1079 
1080  for (i = 0; i < num_decorators; i++)
1081  if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1082  rb_econv_close(ec);
1083  return NULL;
1084  }
1085 
1086  ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1087 
1088  return ec;
1089 }
1090 
1091 static int
1093  const unsigned char **input_ptr, const unsigned char *input_stop,
1094  unsigned char **output_ptr, unsigned char *output_stop,
1095  int flags,
1096  int start)
1097 {
1098  int try;
1099  int i, f;
1100 
1101  const unsigned char **ipp, *is, *iold;
1102  unsigned char **opp, *os, *oold;
1103  rb_econv_result_t res;
1104 
1105  try = 1;
1106  while (try) {
1107  try = 0;
1108  for (i = start; i < ec->num_trans; i++) {
1109  rb_econv_elem_t *te = &ec->elems[i];
1110 
1111  if (i == 0) {
1112  ipp = input_ptr;
1113  is = input_stop;
1114  }
1115  else {
1116  rb_econv_elem_t *prev_te = &ec->elems[i-1];
1117  ipp = (const unsigned char **)&prev_te->out_data_start;
1118  is = prev_te->out_data_end;
1119  }
1120 
1121  if (i == ec->num_trans-1) {
1122  opp = output_ptr;
1123  os = output_stop;
1124  }
1125  else {
1126  if (te->out_buf_start != te->out_data_start) {
1127  ssize_t len = te->out_data_end - te->out_data_start;
1128  ssize_t off = te->out_data_start - te->out_buf_start;
1129  MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1130  te->out_data_start = te->out_buf_start;
1131  te->out_data_end -= off;
1132  }
1133  opp = &te->out_data_end;
1134  os = te->out_buf_end;
1135  }
1136 
1137  f = flags;
1138  if (ec->num_finished != i)
1140  if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1141  start = 1;
1142  flags &= ~ECONV_AFTER_OUTPUT;
1143  }
1144  if (i != 0)
1145  f &= ~ECONV_AFTER_OUTPUT;
1146  iold = *ipp;
1147  oold = *opp;
1148  te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1149  if (iold != *ipp || oold != *opp)
1150  try = 1;
1151 
1152  switch (res) {
1156  case econv_after_output:
1157  return i;
1158 
1161  break;
1162 
1163  case econv_finished:
1164  ec->num_finished = i+1;
1165  break;
1166  }
1167  }
1168  }
1169  return -1;
1170 }
1171 
1172 static rb_econv_result_t
1174  const unsigned char **input_ptr, const unsigned char *input_stop,
1175  unsigned char **output_ptr, unsigned char *output_stop,
1176  int flags,
1177  int *result_position_ptr)
1178 {
1179  int i;
1180  int needreport_index;
1181  int sweep_start;
1182 
1183  unsigned char empty_buf;
1184  unsigned char *empty_ptr = &empty_buf;
1185 
1186  if (!input_ptr) {
1187  input_ptr = (const unsigned char **)&empty_ptr;
1188  input_stop = empty_ptr;
1189  }
1190 
1191  if (!output_ptr) {
1192  output_ptr = &empty_ptr;
1193  output_stop = empty_ptr;
1194  }
1195 
1196  if (ec->elems[0].last_result == econv_after_output)
1198 
1199  needreport_index = -1;
1200  for (i = ec->num_trans-1; 0 <= i; i--) {
1201  switch (ec->elems[i].last_result) {
1205  case econv_after_output:
1206  case econv_finished:
1207  sweep_start = i+1;
1208  needreport_index = i;
1209  goto found_needreport;
1210 
1213  break;
1214 
1215  default:
1216  rb_bug("unexpected transcode last result");
1217  }
1218  }
1219 
1220  /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1221 
1223  (flags & ECONV_AFTER_OUTPUT)) {
1224  rb_econv_result_t res;
1225 
1226  res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1228  result_position_ptr);
1229 
1230  if (res == econv_source_buffer_empty)
1231  return econv_after_output;
1232  return res;
1233  }
1234 
1235  sweep_start = 0;
1236 
1237  found_needreport:
1238 
1239  do {
1240  needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1241  sweep_start = needreport_index + 1;
1242  } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1243 
1244  for (i = ec->num_trans-1; 0 <= i; i--) {
1246  rb_econv_result_t res = ec->elems[i].last_result;
1247  if (res == econv_invalid_byte_sequence ||
1248  res == econv_incomplete_input ||
1249  res == econv_undefined_conversion ||
1250  res == econv_after_output) {
1252  }
1253  if (result_position_ptr)
1254  *result_position_ptr = i;
1255  return res;
1256  }
1257  }
1258  if (result_position_ptr)
1259  *result_position_ptr = -1;
1261 }
1262 
/*
 * One conversion step for `ec`.  Clears ec->last_error, then:
 *  - with no transcoders (num_trans == 0) copies data buffered in
 *    ec->in_buf_start, and after that the caller's input, straight to the
 *    output;
 *  - otherwise flushes output still pending in the last element, pushes any
 *    data buffered in ec->in_buf_start through rb_trans_conv(), and finally
 *    converts the caller's input.
 * The result is stored in ec->last_error.result; for the three error
 * results, details are taken from the transcoder at result_position.
 *
 * NOTE(review): this listing lacks the line carrying the function name and
 * first parameter, and listing lines 1283, 1310, 1312, 1328 and 1375-1377;
 * confirm those against the full file.
 */
1263 static rb_econv_result_t
1265  const unsigned char **input_ptr, const unsigned char *input_stop,
1266  unsigned char **output_ptr, unsigned char *output_stop,
1267  int flags)
1268 {
1269  rb_econv_result_t res;
1270  int result_position;
1271  int has_output = 0;
1272 
1273  memset(&ec->last_error, 0, sizeof(ec->last_error));
1274 
/* No transcoders: plain byte copy. */
1275  if (ec->num_trans == 0) {
1276  size_t len;
/* First drain data left in the converter's own input buffer. */
1277  if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1278  if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1279  len = output_stop - *output_ptr;
1280  memcpy(*output_ptr, ec->in_data_start, len);
1281  *output_ptr = output_stop;
1282  ec->in_data_start += len;
/* NOTE(review): listing line 1283 is absent; res is presumably assigned here before the jump -- confirm. */
1284  goto gotresult;
1285  }
1286  len = ec->in_data_end - ec->in_data_start;
1287  memcpy(*output_ptr, ec->in_data_start, len);
1288  *output_ptr += len;
1289  ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1290  if (flags & ECONV_AFTER_OUTPUT) {
1291  res = econv_after_output;
1292  goto gotresult;
1293  }
1294  }
/* len = min(free output space, remaining caller input). */
1295  if (output_stop - *output_ptr < input_stop - *input_ptr) {
1296  len = output_stop - *output_ptr;
1297  }
1298  else {
1299  len = input_stop - *input_ptr;
1300  }
/* With ECONV_AFTER_OUTPUT only a single byte is copied before returning. */
1301  if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1302  *(*output_ptr)++ = *(*input_ptr)++;
1303  res = econv_after_output;
1304  goto gotresult;
1305  }
1306  memcpy(*output_ptr, *input_ptr, len);
1307  *output_ptr += len;
1308  *input_ptr += len;
/* NOTE(review): the statements of the first two branches (listing lines 1310 and 1312) are absent from this listing. */
1309  if (*input_ptr != input_stop)
1311  else if (flags & ECONV_PARTIAL_INPUT)
1313  else
1314  res = econv_finished;
1315  goto gotresult;
1316  }
1317 
/* Flush output that the last element produced earlier but could not deliver. */
1318  if (ec->elems[ec->num_trans-1].out_data_start) {
1319  unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1320  unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1321  if (data_start != data_end) {
1322  size_t len;
1323  if (output_stop - *output_ptr < data_end - data_start) {
1324  len = output_stop - *output_ptr;
1325  memcpy(*output_ptr, data_start, len);
1326  *output_ptr = output_stop;
1327  ec->elems[ec->num_trans-1].out_data_start += len;
/* NOTE(review): listing line 1328 is absent; res is presumably assigned here before the jump -- confirm. */
1329  goto gotresult;
1330  }
1331  len = data_end - data_start;
1332  memcpy(*output_ptr, data_start, len);
1333  *output_ptr += len;
1334  ec->elems[ec->num_trans-1].out_data_start =
1335  ec->elems[ec->num_trans-1].out_data_end =
1336  ec->elems[ec->num_trans-1].out_buf_start;
1337  has_output = 1;
1338  }
1339  }
1340 
/* Data buffered inside the converter is converted before the caller's input,
 * always as partial input and without ECONV_AFTER_OUTPUT. */
1341  if (ec->in_buf_start &&
1342  ec->in_data_start != ec->in_data_end) {
1343  res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1344  (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1345  if (res != econv_source_buffer_empty)
1346  goto gotresult;
1347  }
1348 
/* Output was already delivered above and the caller asked for
 * ECONV_AFTER_OUTPUT: run with an empty input range (input_stop is moved to
 * *input_ptr) and report econv_after_output instead of source_buffer_empty. */
1349  if (has_output &&
1350  (flags & ECONV_AFTER_OUTPUT) &&
1351  *input_ptr != input_stop) {
1352  input_stop = *input_ptr;
1353  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1354  if (res == econv_source_buffer_empty)
1355  res = econv_after_output;
1356  }
1357  else if ((flags & ECONV_AFTER_OUTPUT) ||
1358  ec->num_trans == 1) {
1359  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1360  }
1361  else {
/* Several transcoders and no ECONV_AFTER_OUTPUT requested: set the flag
 * locally and keep calling while rb_trans_conv() reports econv_after_output. */
1362  flags |= ECONV_AFTER_OUTPUT;
1363  do {
1364  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1365  } while (res == econv_after_output);
1366  }
1367 
1368  gotresult:
1369  ec->last_error.result = res;
/* Only error results read result_position; it is set by rb_trans_conv(). */
1370  if (res == econv_invalid_byte_sequence ||
1371  res == econv_incomplete_input ||
1372  res == econv_undefined_conversion) {
1373  rb_transcoding *error_tc = ec->elems[result_position].tc;
1374  ec->last_error.error_tc = error_tc;
/* NOTE(review): listing lines 1375-1377 are absent here. */
1378  ec->last_error.error_bytes_len = error_tc->recognized_len;
1379  ec->last_error.readagain_len = error_tc->readagain_len;
1380  }
1381 
1382  return res;
1383 }
1384 
1386 
/*
 * Insert the erroring bytes recorded in ec->last_error into the output as
 * hexadecimal character references ("&#x...;").
 *
 * The bytes are read as big-endian 32-bit units: they are used directly when
 * the error's source encoding equals "UTF-32BE"; otherwise `utf` comes from a
 * call that may use the on-stack utfbuf or hand back separately allocated
 * storage (tracked by utf_allocated and released with xfree()).
 *
 * Returns 0 on success and -1 on failure (no `utf`, a length that is not a
 * multiple of 4, or rb_econv_insert_output() failing).
 *
 * NOTE(review): the line with the function name/parameters and listing lines
 * 1403-1404 (the start of the call that produces `utf`) are absent from this
 * listing.
 */
1387 static int
1389 {
1390  int ret;
1391  unsigned char utfbuf[1024];
1392  const unsigned char *utf;
1393  size_t utf_len;
1394  int utf_allocated = 0;
1395  char charef_buf[16];
1396  const unsigned char *p;
1397 
1398  if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1399  utf = ec->last_error.error_bytes_start;
1400  utf_len = ec->last_error.error_bytes_len;
1401  }
1402  else {
1405  utfbuf, sizeof(utfbuf),
1406  &utf_len);
1407  if (!utf)
1408  return -1;
/* Anything that is neither utfbuf nor the original error bytes must be freed by us. */
1409  if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1410  utf_allocated = 1;
1411  }
1412 
1413  if (utf_len % 4 != 0)
1414  goto fail;
1415 
1416  p = utf;
1417  while (4 <= utf_len) {
/* Assemble one code value from four bytes, most significant byte first. */
1418  unsigned int u = 0;
1419  u += p[0] << 24;
1420  u += p[1] << 16;
1421  u += p[2] << 8;
1422  u += p[3];
1423  snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1424 
1425  ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1426  if (ret == -1)
1427  goto fail;
1428 
1429  p += 4;
1430  utf_len -= 4;
1431  }
1432 
1433  if (utf_allocated)
1434  xfree((void *)utf);
1435  return 0;
1436 
1437  fail:
1438  if (utf_allocated)
1439  xfree((void *)utf);
1440  return -1;
1441 }
1442 
/*
 * Conversion entry point wrapping rb_econv_convert0().
 *
 * Marks the converter as started, substitutes an empty dummy range when
 * input_ptr or output_ptr is NULL, and then runs rb_econv_convert0().  When
 * that reports an error for which ec->flags selects a replacement action,
 * the replacement is inserted and the conversion is resumed; otherwise the
 * result is returned unchanged.
 *
 * NOTE(review): the lines with the return type, function name and first
 * parameter, and the case label at listing line 1490, are absent from this
 * listing.
 */
1445  const unsigned char **input_ptr, const unsigned char *input_stop,
1446  unsigned char **output_ptr, unsigned char *output_stop,
1447  int flags)
1448 {
1449  rb_econv_result_t ret;
1450 
1451  unsigned char empty_buf;
1452  unsigned char *empty_ptr = &empty_buf;
1453 
1454  ec->started = 1;
1455 
/* A NULL input means "no input": point at a zero-length range. */
1456  if (!input_ptr) {
1457  input_ptr = (const unsigned char **)&empty_ptr;
1458  input_stop = empty_ptr;
1459  }
1460 
/* A NULL output means "no room": point at a zero-length range. */
1461  if (!output_ptr) {
1462  output_ptr = &empty_ptr;
1463  output_stop = empty_ptr;
1464  }
1465 
1466  resume:
1467  ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1468 
1469  if (ret == econv_invalid_byte_sequence ||
1470  ret == econv_incomplete_input) {
1471  /* deal with invalid byte sequence */
1472  /* todo: add more alternative behaviors */
1473  switch (ec->flags & ECONV_INVALID_MASK) {
1474  case ECONV_INVALID_REPLACE:
1475  if (output_replacement_character(ec) == 0)
1476  goto resume;
1477  }
1478  }
1479 
1480  if (ret == econv_undefined_conversion) {
1481  /* valid character in source encoding
1482  * but no related character(s) in destination encoding */
1483  /* todo: add more alternative behaviors */
1484  switch (ec->flags & ECONV_UNDEF_MASK) {
1485  case ECONV_UNDEF_REPLACE:
1486  if (output_replacement_character(ec) == 0)
1487  goto resume;
1488  break;
1489 
/* NOTE(review): the case label for the hex-charref action (listing line 1490) is absent here. */
1491  if (output_hex_charref(ec) == 0)
1492  goto resume;
1493  break;
1494  }
1495  }
1496 
1497  return ret;
1498 }
1499 
/*
 * Returns an encoding name derived from ec->last_tc:
 *  - "" when there is no last transcoding;
 *  - the transcoder's src_encoding when its asciicompat_type is
 *    asciicompat_encoder;
 *  - its dst_encoding otherwise.
 *
 * NOTE(review): the line with the function name and parameter list is absent
 * from this listing.
 */
1500 const char *
1502 {
1503  rb_transcoding *tc = ec->last_tc;
1504  const rb_transcoder *tr;
1505 
1506  if (tc == NULL)
1507  return "";
1508 
1509  tr = tc->transcoder;
1510 
1511  if (tr->asciicompat_type == asciicompat_encoder)
1512  return tr->src_encoding;
1513  return tr->dst_encoding;
1514 }
1515 
1516 static unsigned char *
1517 allocate_converted_string(const char *sname, const char *dname,
1518  const unsigned char *str, size_t len,
1519  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1520  size_t *dst_len_ptr)
1521 {
1522  unsigned char *dst_str;
1523  size_t dst_len;
1524  size_t dst_bufsize;
1525 
1526  rb_econv_t *ec;
1527  rb_econv_result_t res;
1528 
1529  const unsigned char *sp;
1530  unsigned char *dp;
1531 
1532  if (caller_dst_buf)
1533  dst_bufsize = caller_dst_bufsize;
1534  else if (len == 0)
1535  dst_bufsize = 1;
1536  else
1537  dst_bufsize = len;
1538 
1539  ec = rb_econv_open(sname, dname, 0);
1540  if (ec == NULL)
1541  return NULL;
1542  if (caller_dst_buf)
1543  dst_str = caller_dst_buf;
1544  else
1545  dst_str = xmalloc(dst_bufsize);
1546  dst_len = 0;
1547  sp = str;
1548  dp = dst_str+dst_len;
1549  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1550  dst_len = dp - dst_str;
1551  while (res == econv_destination_buffer_full) {
1552  if (SIZE_MAX/2 < dst_bufsize) {
1553  goto fail;
1554  }
1555  dst_bufsize *= 2;
1556  if (dst_str == caller_dst_buf) {
1557  unsigned char *tmp;
1558  tmp = xmalloc(dst_bufsize);
1559  memcpy(tmp, dst_str, dst_bufsize/2);
1560  dst_str = tmp;
1561  }
1562  else {
1563  dst_str = xrealloc(dst_str, dst_bufsize);
1564  }
1565  dp = dst_str+dst_len;
1566  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1567  dst_len = dp - dst_str;
1568  }
1569  if (res != econv_finished) {
1570  goto fail;
1571  }
1572  rb_econv_close(ec);
1573  *dst_len_ptr = dst_len;
1574  return dst_str;
1575 
1576  fail:
1577  if (dst_str != caller_dst_buf)
1578  xfree(dst_str);
1579  rb_econv_close(ec);
1580  return NULL;
1581 }
1582 
1583 /* result: 0:success -1:failure */
/*
 * Queue `len` bytes of `str` (given in str_encoding) inside the converter.
 *
 * The text is first brought into the encoding reported by
 * rb_econv_encoding_to_insert_output(): used as is when the encodings are
 * equal, otherwise converted with allocate_converted_string() (using the
 * on-stack insert_buf when the result fits).
 *
 * Target buffer:
 *  - no transcoders: the converter's input buffer (ec->in_buf_*);
 *  - last transcoder is an asciicompat_encoder: the buffer feeding that
 *    transcoder (ec->in_buf_* or the previous element's out_buf_*); its
 *    read-again bytes are appended after the inserted text and
 *    readagain_len is reset to 0;
 *  - otherwise: the last element's output buffer.
 * The buffer is created, compacted or enlarged as needed.
 *
 * Returns 0 on success, -1 on failure (conversion failed or a size
 * computation wrapped around).
 *
 * NOTE(review): the line with the function name and first parameter
 * (listing line 1585) is absent from this listing.
 */
1584 int
1586  const unsigned char *str, size_t len, const char *str_encoding)
1587 {
1588  const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1589  unsigned char insert_buf[4096];
1590  const unsigned char *insert_str = NULL;
1591  size_t insert_len;
1592 
1593  int last_trans_index;
1594  rb_transcoding *tc;
1595 
1596  unsigned char **buf_start_p;
1597  unsigned char **data_start_p;
1598  unsigned char **data_end_p;
1599  unsigned char **buf_end_p;
1600 
1601  size_t need;
1602 
1603  ec->started = 1;
1604 
1605  if (len == 0)
1606  return 0;
1607 
1608  if (encoding_equal(insert_encoding, str_encoding)) {
1609  insert_str = str;
1610  insert_len = len;
1611  }
1612  else {
1613  insert_str = allocate_converted_string(str_encoding, insert_encoding,
1614  str, len, insert_buf, sizeof(insert_buf), &insert_len);
1615  if (insert_str == NULL)
1616  return -1;
1617  }
1618 
1619  need = insert_len;
1620 
1621  last_trans_index = ec->num_trans-1;
1622  if (ec->num_trans == 0) {
1623  tc = NULL;
1624  buf_start_p = &ec->in_buf_start;
1625  data_start_p = &ec->in_data_start;
1626  data_end_p = &ec->in_data_end;
1627  buf_end_p = &ec->in_buf_end;
1628  }
1629  else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1630  tc = ec->elems[last_trans_index].tc;
/* Room is also needed for the read-again bytes re-appended below; a result smaller than insert_len means the addition wrapped. */
1631  need += tc->readagain_len;
1632  if (need < insert_len)
1633  goto fail;
1634  if (last_trans_index == 0) {
1635  buf_start_p = &ec->in_buf_start;
1636  data_start_p = &ec->in_data_start;
1637  data_end_p = &ec->in_data_end;
1638  buf_end_p = &ec->in_buf_end;
1639  }
1640  else {
1641  rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1642  buf_start_p = &ee->out_buf_start;
1643  data_start_p = &ee->out_data_start;
1644  data_end_p = &ee->out_data_end;
1645  buf_end_p = &ee->out_buf_end;
1646  }
1647  }
1648  else {
1649  rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1650  buf_start_p = &ee->out_buf_start;
1651  data_start_p = &ee->out_data_start;
1652  data_end_p = &ee->out_data_end;
1653  buf_end_p = &ee->out_buf_end;
1654  tc = ec->elems[last_trans_index].tc;
1655  }
1656 
1657  if (*buf_start_p == NULL) {
1658  unsigned char *buf = xmalloc(need);
1659  *buf_start_p = buf;
1660  *data_start_p = buf;
1661  *data_end_p = buf;
1662  *buf_end_p = buf+need;
1663  }
1664  else if ((size_t)(*buf_end_p - *data_end_p) < need) {
/* Not enough room at the tail: slide live data to the buffer start, then enlarge if it is still too small. */
1665  MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1666  *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1667  *data_start_p = *buf_start_p;
1668  if ((size_t)(*buf_end_p - *data_end_p) < need) {
1669  unsigned char *buf;
1670  size_t s = (*data_end_p - *buf_start_p) + need;
1671  if (s < need)
1672  goto fail;
1673  buf = xrealloc(*buf_start_p, s);
1674  *data_start_p = buf;
/* *buf_start_p still holds the old address here; it is only used to compute the data length. */
1675  *data_end_p = buf + (*data_end_p - *buf_start_p);
1676  *buf_start_p = buf;
1677  *buf_end_p = buf + s;
1678  }
1679  }
1680 
1681  memcpy(*data_end_p, insert_str, insert_len);
1682  *data_end_p += insert_len;
1683  if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1684  memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1685  *data_end_p += tc->readagain_len;
1686  tc->readagain_len = 0;
1687  }
1688 
/* Free the converted text only when allocate_converted_string() had to allocate it. */
1689  if (insert_str != str && insert_str != insert_buf)
1690  xfree((void*)insert_str);
1691  return 0;
1692 
1693  fail:
1694  if (insert_str != str && insert_str != insert_buf)
1695  xfree((void*)insert_str);
1696  return -1;
1697 }
1698 
/*
 * Release a converter: the replacement string (only when
 * replacement_allocated is set), every element's transcoding and output
 * buffer, the input buffer, the elems array and finally `ec` itself.
 *
 * NOTE(review): the line with the function name and parameter list is absent
 * from this listing.
 */
1699 void
1701 {
1702  int i;
1703 
1704  if (ec->replacement_allocated) {
1705  xfree((void *)ec->replacement_str);
1706  }
1707  for (i = 0; i < ec->num_trans; i++) {
1708  rb_transcoding_close(ec->elems[i].tc);
1709  if (ec->elems[i].out_buf_start)
1710  xfree(ec->elems[i].out_buf_start);
1711  }
1712  xfree(ec->in_buf_start);
1713  xfree(ec->elems);
1714  xfree(ec);
1715 }
1716 
/*
 * Byte count attributed to a converter: sizeof(rb_econv_t), the replacement
 * string (only when replacement_allocated is set), each element's
 * rb_transcoding_memsize() and output-buffer capacity, the input-buffer
 * capacity, and the elems array (num_allocated entries).
 *
 * NOTE(review): the line with the function name and parameter list is absent
 * from this listing.
 */
1717 size_t
1719 {
1720  size_t size = sizeof(rb_econv_t);
1721  int i;
1722 
1723  if (ec->replacement_allocated) {
1724  size += ec->replacement_len;
1725  }
1726  for (i = 0; i < ec->num_trans; i++) {
1727  size += rb_transcoding_memsize(ec->elems[i].tc);
1728 
1729  if (ec->elems[i].out_buf_start) {
1730  size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1731  }
1732  }
1733  size += ec->in_buf_end - ec->in_buf_start;
1734  size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1735 
1736  return size;
1737 }
1738 
/*
 * Returns the first transcoder's readagain_len as an int, or 0 when the
 * converter has no transcoders.  Where size_t is wider than int the value is
 * capped at INT_MAX before the narrowing cast.
 *
 * NOTE(review): the line with the function name and parameter list is absent
 * from this listing.
 */
1739 int
1741 {
1742  if (ec->num_trans == 0)
1743  return 0;
1744 #if SIZEOF_SIZE_T > SIZEOF_INT
1745  if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1746 #endif
1747  return (int)ec->elems[0].tc->readagain_len;
1748 }
1749 
1750 void
1751 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1752 {
1753  rb_transcoding *tc;
1754  if (ec->num_trans == 0 || n == 0)
1755  return;
1756  tc = ec->elems[0].tc;
1757  memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1758  tc->readagain_len -= n;
1759 }
1760 
/* NOTE(review): the line opening this struct definition (listing line 1761) is absent from this listing. */
/* ascii_compat_name is filled in by asciicompat_encoding_i(); ascii_incompat_name is not read anywhere in the visible code. */
1762  const char *ascii_compat_name;
1763  const char *ascii_incompat_name;
1764 };
1765 
/*
 * Iteration callback (passed to st_foreach() below in this file).
 * Entries that are decorators are skipped.  For any other entry the
 * transcoder is loaded; if it is an asciicompat_decoder its dst_encoding is
 * stored in data->ascii_compat_name and iteration stops (ST_STOP).
 *
 * NOTE(review): the line with the function name/parameters and the
 * declaration of `entry` (listing lines 1767 and 1770) are absent from this
 * listing.
 */
1766 static int
1768 {
1769  struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1771  const rb_transcoder *tr;
1772 
1773  if (DECORATOR_P(entry->sname, entry->dname))
1774  return ST_CONTINUE;
1775  tr = load_transcoder_entry(entry);
1776  if (tr && tr->asciicompat_type == asciicompat_decoder) {
1777  data->ascii_compat_name = tr->dst_encoding;
1778  return ST_STOP;
1779  }
1780  return ST_CONTINUE;
1781 }
1782 
/*
 * Looks up a second-level table (table2) and, only when it holds exactly one
 * entry, scans it with asciicompat_encoding_i().  Returns the name that
 * callback stored, or NULL when the lookup fails, the table does not have
 * exactly one entry, or no asciicompat_decoder is found.
 *
 * NOTE(review): the line with the function name/parameters, the lookup
 * condition (listing line 1790) and listing line 1804 are absent from this
 * listing.
 */
1783 const char *
1785 {
1786  st_data_t v;
1787  st_table *table2;
1788  struct asciicompat_encoding_t data;
1789 
1791  return NULL;
1792  table2 = (st_table *)v;
1793 
1794  /*
1795  * Assumption:
1796  * There is at most one transcoder for
1797  * converting from ASCII incompatible encoding.
1798  *
1799  * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1800  */
1801  if (table2->num_entries != 1)
1802  return NULL;
1803 
1805  data.ascii_compat_name = NULL;
1806  st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1807  return data.ascii_compat_name;
1808 }
1809 
/*
 * Convert `len` bytes at `ss` with `ec` and append the result to the Ruby
 * string `dst` (a new string buffer is created when dst is nil).
 *
 * Before each conversion attempt the string is grown so that at least
 * len + max_output bytes are free past its current length; max_output comes
 * from the last transcoder, or is 1 when there is none.  The loop repeats
 * while the converter reports econv_destination_buffer_full.
 *
 * Raises ArgumentError ("too long string") when the needed capacity exceeds
 * LONG_MAX.  Returns dst.
 *
 * NOTE(review): listing lines 1821 and 1847 are absent from this listing.
 */
1810 VALUE
1811 rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1812 {
1813  unsigned const char *sp, *se;
1814  unsigned char *ds, *dp, *de;
1815  rb_econv_result_t res;
1816  int max_output;
1817 
1818  if (NIL_P(dst)) {
1819  dst = rb_str_buf_new(len);
/* NOTE(review): the statement guarded by this condition (listing line 1821) is absent here. */
1820  if (ec->destination_encoding)
1822  }
1823 
1824  if (ec->last_tc)
1825  max_output = ec->last_tc->transcoder->max_output;
1826  else
1827  max_output = 1;
1828 
1829  do {
1830  long dlen = RSTRING_LEN(dst);
1831  if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1832  unsigned long new_capa = (unsigned long)dlen + len + max_output;
1833  if (LONG_MAX < new_capa)
1834  rb_raise(rb_eArgError, "too long string");
/* Resize to obtain the capacity, then restore the logical length. */
1835  rb_str_resize(dst, new_capa);
1836  rb_str_set_len(dst, dlen);
1837  }
1838  sp = (const unsigned char *)ss;
1839  se = sp + len;
1840  ds = (unsigned char *)RSTRING_PTR(dst);
1841  de = ds + rb_str_capacity(dst);
/* Both ds and dp now point at the append position (buffer start + dlen). */
1842  dp = ds += dlen;
1843  res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
/* Account for the input that was consumed so a retry continues from there. */
1844  len -= (const char *)sp - ss;
1845  ss = (const char *)sp;
1846  rb_str_set_len(dst, dlen + (dp - ds));
1848  } while (res == econv_destination_buffer_full);
1849 
1850  return dst;
1851 }
1852 
1853 VALUE
1854 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1855 {
1856  src = rb_str_new_frozen(src);
1857  dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1858  RB_GC_GUARD(src);
1859  OBJ_INFECT_RAW(dst, src);
1860  return dst;
1861 }
1862 
1863 VALUE
1864 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1865 {
1866  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1867 }
1868 
1869 VALUE
1870 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1871 {
1872  return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1873 }
1874 
/*
 * Forwards to rb_econv_substr_append() with offset 0, the full
 * RSTRING_LEN(src) and Qnil as the destination.
 *
 * NOTE(review): the line with the function name and parameter list is absent
 * from this listing.
 */
1875 VALUE
1877 {
1878  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1879 }
1880 
1881 static int
1882 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1883 {
1884  transcoder_entry_t *entry;
1885  const rb_transcoder *tr;
1886 
1887  if (ec->started != 0)
1888  return -1;
1889 
1890  entry = get_transcoder_entry(sname, dname);
1891  if (!entry)
1892  return -1;
1893 
1894  tr = load_transcoder_entry(entry);
1895  if (!tr) return -1;
1896 
1897  return rb_econv_add_transcoder_at(ec, tr, n);
1898 }
1899 
1900 static int
1901 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1902 {
1903  return rb_econv_add_converter(ec, "", decorator_name, n);
1904 }
1905 
1906 int
1907 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1908 {
1909  const rb_transcoder *tr;
1910 
1911  if (ec->num_trans == 0)
1912  return rb_econv_decorate_at(ec, decorator_name, 0);
1913 
1914  tr = ec->elems[0].tc->transcoder;
1915 
1916  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1917  tr->asciicompat_type == asciicompat_decoder)
1918  return rb_econv_decorate_at(ec, decorator_name, 1);
1919 
1920  return rb_econv_decorate_at(ec, decorator_name, 0);
1921 }
1922 
1923 int
1924 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1925 {
1926  const rb_transcoder *tr;
1927 
1928  if (ec->num_trans == 0)
1929  return rb_econv_decorate_at(ec, decorator_name, 0);
1930 
1931  tr = ec->elems[ec->num_trans-1].tc->transcoder;
1932 
1933  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1934  tr->asciicompat_type == asciicompat_encoder)
1935  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1936 
1937  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1938 }
1939 
/*
 * Picks a newline decorator name from ec->flags &
 * ECONV_NEWLINE_DECORATOR_MASK and, when one is selected, removes every
 * element using that decorator's transcoder from ec->elems: the matching
 * transcoding is closed, its output buffer freed and num_trans decremented,
 * while the remaining elements are compacted toward the front.
 *
 * NOTE(review): the line with the function name/parameters, the three case
 * labels (listing lines 1946, 1949, 1952) and the final statement (listing
 * line 1973) are absent from this listing.
 */
1940 void
1942 {
1943  const char *dname = 0;
1944 
1945  switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1947  dname = "universal_newline";
1948  break;
1950  dname = "crlf_newline";
1951  break;
1953  dname = "cr_newline";
1954  break;
1955  }
1956 
1957  if (dname) {
/* The result of get_transcoder_entry() is dereferenced without a NULL check. */
1958  const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
/* Iterate over the original count; ec->num_trans shrinks inside the loop. */
1959  int num_trans = ec->num_trans;
1960  int i, j = 0;
1961 
1962  for (i=0; i < num_trans; i++) {
1963  if (transcoder == ec->elems[i].tc->transcoder) {
1964  rb_transcoding_close(ec->elems[i].tc);
1965  xfree(ec->elems[i].out_buf_start);
1966  ec->num_trans--;
1967  }
1968  else
1969  ec->elems[j++] = ec->elems[i];
1970  }
1971  }
1972 
1974 }
1975 
/*
 * Append a human-readable description of a conversion to `mesg` (a new empty
 * string is created when mesg is nil) and return it.
 *
 * The encoding part is "<sname> to <dname>", or just the non-empty name when
 * one of them is "".  Decorators selected in ecflags follow as a
 * comma-separated list, introduced by " with " when an encoding part was
 * written.  When nothing at all was written, "no-conversion" is appended.
 *
 * NOTE(review): the rest of the flag mask in the decorator test (listing
 * lines 1995-1997) is absent from this listing.
 */
1976 static VALUE
1977 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1978 {
1979  int has_description = 0;
1980 
1981  if (NIL_P(mesg))
1982  mesg = rb_str_new(NULL, 0);
1983 
1984  if (*sname != '\0' || *dname != '\0') {
1985  if (*sname == '\0')
1986  rb_str_cat2(mesg, dname);
1987  else if (*dname == '\0')
1988  rb_str_cat2(mesg, sname);
1989  else
1990  rb_str_catf(mesg, "%s to %s", sname, dname);
1991  has_description = 1;
1992  }
1993 
1994  if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
/* `pre` is the separator: empty before the first name, "," afterwards. */
1998  const char *pre = "";
1999  if (has_description)
2000  rb_str_cat2(mesg, " with ");
2001  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2002  rb_str_cat2(mesg, pre); pre = ",";
2003  rb_str_cat2(mesg, "universal_newline");
2004  }
2005  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2006  rb_str_cat2(mesg, pre); pre = ",";
2007  rb_str_cat2(mesg, "crlf_newline");
2008  }
2009  if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2010  rb_str_cat2(mesg, pre); pre = ",";
2011  rb_str_cat2(mesg, "cr_newline");
2012  }
2013  if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2014  rb_str_cat2(mesg, pre); pre = ",";
2015  rb_str_cat2(mesg, "xml_text");
2016  }
2017  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2018  rb_str_cat2(mesg, pre); pre = ",";
2019  rb_str_cat2(mesg, "xml_attr_content");
2020  }
2021  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2022  rb_str_cat2(mesg, pre); pre = ",";
2023  rb_str_cat2(mesg, "xml_attr_quote");
2024  }
2025  has_description = 1;
2026  }
2027  if (!has_description) {
2028  rb_str_cat2(mesg, "no-conversion");
2029  }
2030 
2031  return mesg;
2032 }
2033 
/*
 * Build the exception object for a failed converter lookup.  The message is
 * "code converter not found (<description>)", with the description written
 * by econv_description().
 *
 * NOTE(review): the statement that creates `exc` from `mesg` (listing line
 * 2041) is absent from this listing.
 */
2034 VALUE
2035 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2036 {
2037  VALUE mesg, exc;
2038  mesg = rb_str_new_cstr("code converter not found (");
2039  econv_description(sname, dname, ecflags, mesg);
2040  rb_str_cat2(mesg, ")");
2042  return exc;
2043 }
2044 
/*
 * Build an exception object describing the error stored in ec->last_error.
 *
 * Two branches are visible:
 *  - the first formats the error bytes (and read-again bytes, if any) and
 *    sets the ivars error_bytes, readagain_bytes and incomplete_input;
 *  - the second formats the unconvertible character (as "U+XXXX" when the
 *    source encoding is "UTF-8" and the bytes form exactly one character),
 *    sets the ivar error_char and jumps to the shared `set_encs` code.
 * `set_encs` stores the source/destination encoding names and, when an
 * encoding index is available (0 <= idx), the encoding objects.
 * Qnil is returned when neither branch is taken.
 *
 * NOTE(review): many lines are absent from this listing, including the
 * function name/parameters, the conditions opening both branches, the
 * statements that create `exc` and compute `idx`, and some rb_sprintf()
 * arguments.  Confirm every detail against the full file.
 */
2045 static VALUE
2047 {
2048  VALUE mesg, exc;
2051  const char *err = (const char *)ec->last_error.error_bytes_start;
2052  size_t error_len = ec->last_error.error_bytes_len;
2053  VALUE bytes = rb_str_new(err, error_len);
2054  VALUE dumped = rb_str_dump(bytes);
2055  size_t readagain_len = ec->last_error.readagain_len;
2056  VALUE bytes2 = Qnil;
2057  VALUE dumped2;
2058  int idx;
2060  mesg = rb_sprintf("incomplete %s on %s",
2061  StringValueCStr(dumped),
2063  }
2064  else if (readagain_len) {
/* The read-again bytes are taken from directly after the error bytes. */
2065  bytes2 = rb_str_new(err+error_len, readagain_len);
2066  dumped2 = rb_str_dump(bytes2);
2067  mesg = rb_sprintf("%s followed by %s on %s",
2068  StringValueCStr(dumped),
2069  StringValueCStr(dumped2),
2071  }
2072  else {
2073  mesg = rb_sprintf("%s on %s",
2074  StringValueCStr(dumped),
2076  }
2077 
2079  rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
2080  rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
2081  rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
2082 
/* Shared tail: the second branch below jumps here with `goto set_encs`. */
2083  set_encs:
2084  rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
2085  rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
2087  if (0 <= idx)
2088  rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2090  if (0 <= idx)
2091  rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2092  return exc;
2093  }
2095  VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2097  VALUE dumped = Qnil;
2098  int idx;
2099  if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2100  rb_encoding *utf8 = rb_utf8_encoding();
2101  const char *start, *end;
2102  int n;
2103  start = (const char *)ec->last_error.error_bytes_start;
2104  end = start + ec->last_error.error_bytes_len;
2105  n = rb_enc_precise_mbclen(start, end, utf8);
/* Use the code point form only when the error bytes are exactly one complete character. */
2106  if (MBCLEN_CHARFOUND_P(n) &&
2107  (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2108  unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2109  dumped = rb_sprintf("U+%04X", cc);
2110  }
2111  }
2112  if (dumped == Qnil)
2113  dumped = rb_str_dump(bytes);
/* Short message when the failing step spans the whole conversion; otherwise list the chain. */
2114  if (strcmp(ec->last_error.source_encoding,
2115  ec->source_encoding_name) == 0 &&
2116  strcmp(ec->last_error.destination_encoding,
2117  ec->destination_encoding_name) == 0) {
2118  mesg = rb_sprintf("%s from %s to %s",
2119  StringValueCStr(dumped),
2122  }
2123  else {
2124  int i;
2125  mesg = rb_sprintf("%s to %s in conversion from %s",
2126  StringValueCStr(dumped),
2128  ec->source_encoding_name);
2129  for (i = 0; i < ec->num_trans; i++) {
2130  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2131  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2132  rb_str_catf(mesg, " to %s",
2133  ec->elems[i].tc->transcoder->dst_encoding);
2134  }
2135  }
2138  if (0 <= idx)
2139  rb_enc_associate_index(bytes, idx);
2140  rb_ivar_set(exc, rb_intern("error_char"), bytes);
2141  goto set_encs;
2142  }
2143  return Qnil;
2144 }
2145 
/*
 * Enlarge the destination buffer.  The new size is
 * (bytes written so far + max_output) * 2.  resize_destination() is called
 * with the destination, the current length and the new size, and the three
 * buffer pointers are re-derived from the address it returns.
 *
 * NOTE(review): the line with the function name and opening parenthesis
 * (listing line 2147) is absent from this listing.
 */
2146 static void
2148  VALUE destination,
2149  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2150  int max_output,
2151  unsigned char **out_start_ptr,
2152  unsigned char **out_pos,
2153  unsigned char **out_stop_ptr)
2154 {
2155  size_t len = (*out_pos - *out_start_ptr);
2156  size_t new_len = (len + max_output) * 2;
2157  *out_start_ptr = resize_destination(destination, len, new_len);
2158  *out_pos = *out_start_ptr + len;
2159  *out_stop_ptr = *out_start_ptr + new_len;
2160 }
2161 
/*
 * Make sure ec->replacement_str is set.  Does nothing when it already is.
 * Otherwise the replacement comes from get_replacement_character() for the
 * encoding reported by rb_econv_encoding_to_insert_output(), or is "?" with
 * an empty encoding name when that encoding name is empty.  The stored
 * string is marked as not allocated.  Every visible path returns 0.
 *
 * NOTE(review): the line with the function name and parameter list is absent
 * from this listing.
 */
2162 static int
2164 {
2165  rb_transcoding *tc;
2166  const rb_transcoder *tr;
2167  const unsigned char *replacement;
2168  const char *repl_enc;
2169  const char *ins_enc;
2170  size_t len;
2171 
2172  if (ec->replacement_str)
2173  return 0;
2174 
2175  ins_enc = rb_econv_encoding_to_insert_output(ec);
2176 
2177  tc = ec->last_tc;
2178  if (*ins_enc) {
2179  tr = tc->transcoder;
/* The result of rb_enc_find() is discarded; NOTE(review): presumably called for a side effect such as loading the encoding -- confirm. */
2180  rb_enc_find(tr->dst_encoding);
2181  replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2182  }
2183  else {
2184  replacement = (unsigned char *)"?";
2185  len = 1;
2186  repl_enc = "";
2187  }
2188 
2189  ec->replacement_str = replacement;
2190  ec->replacement_len = len;
2191  ec->replacement_enc = repl_enc;
2192  ec->replacement_allocated = 0;
2193  return 0;
2194 }
2195 
/*
 * Install `str` (`len` bytes, in encoding `encname`) as the converter's
 * replacement string.
 *
 * The bytes are copied as they are when the insert-output encoding is empty
 * or equal to encname; otherwise they are converted to that encoding with
 * allocate_converted_string().  A previously allocated replacement is freed
 * and the new one is recorded as allocated.
 *
 * Returns 0 on success, -1 when the conversion fails.
 *
 * NOTE(review): the line with the function name and first parameter
 * (listing line 2197) is absent from this listing.
 */
2196 int
2198  const unsigned char *str, size_t len, const char *encname)
2199 {
2200  unsigned char *str2;
2201  size_t len2;
2202  const char *encname2;
2203 
2204  encname2 = rb_econv_encoding_to_insert_output(ec);
2205 
2206  if (!*encname2 || encoding_equal(encname, encname2)) {
2207  str2 = xmalloc(len);
2208  MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2209  len2 = len;
2210  encname2 = encname;
2211  }
2212  else {
2213  str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2214  if (!str2)
2215  return -1;
2216  }
2217 
2218  if (ec->replacement_allocated) {
2219  xfree((void *)ec->replacement_str);
2220  }
2221  ec->replacement_allocated = 1;
2222  ec->replacement_str = str2;
2223  ec->replacement_len = len2;
2224  ec->replacement_enc = encname2;
2225  return 0;
2226 }
2227 
/*
 * Prepares the replacement string with make_replacement() and returns 0 on
 * success or -1 when either step reports -1.
 *
 * NOTE(review): the line with the function name/parameters and the statement
 * that assigns `ret` (listing line 2236) are absent from this listing.
 */
2228 static int
2230 {
2231  int ret;
2232 
2233  if (make_replacement(ec) == -1)
2234  return -1;
2235 
2237  if (ret == -1)
2238  return -1;
2239 
2240  return 0;
2241 }
2242 
2243 #if 1
2244 #define hash_fallback rb_hash_aref
2245 
/*
 * Fallback helper: calls rb_proc_call() on `fallback` with a one-element
 * array holding `c` and returns the result.
 * NOTE(review): the line with the function name and parameters is absent from this listing.
 */
2246 static VALUE
2248 {
2249  return rb_proc_call(fallback, rb_ary_new4(1, &c));
2250 }
2251 
/*
 * Fallback helper: calls rb_method_call() on `fallback` with the single
 * argument `c` and returns the result.
 * NOTE(review): the line with the function name and parameters is absent from this listing.
 */
2252 static VALUE
2254 {
2255  return rb_method_call(1, &c, fallback);
2256 }
2257 
/*
 * Fallback helper: invokes the method identified by sym_aref on `fallback`
 * with the single argument `c` (via rb_funcall3()) and returns the result.
 * NOTE(review): the line with the function name and parameters is absent from this listing.
 */
2258 static VALUE
2260 {
2261  return rb_funcall3(fallback, sym_aref, 1, &c);
2262 }
2263 
/*
 * Convert the bytes in [*in_pos, in_stop) from src_encoding to dst_encoding,
 * writing to [*out_pos, out_stop) and enlarging the destination through
 * resize_destination() whenever it fills up.
 *
 * A converter is opened with rb_econv_open_opts(); the exception from
 * rb_econv_open_exc() is raised when that fails.  If ecopts is a hash, its
 * sym_fallback entry is used for undefined conversions: the fallback is
 * called, and a result that is neither Qundef nor nil is inserted into the
 * output before the conversion resumes.  The call style depends on whether
 * the fallback is a Hash, a Proc, a Method or anything else.
 *
 * Invalid, incomplete or (unhandled) undefined input raises the exception
 * built by make_econv_exception() after closing the converter.
 *
 * NOTE(review): two argument lines of the rb_enc_str_new() call (listing
 * lines 2311-2312) are absent from this listing.
 */
2264 static void
2265 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2266  const unsigned char *in_stop, unsigned char *out_stop,
2267  VALUE destination,
2268  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2269  const char *src_encoding,
2270  const char *dst_encoding,
2271  int ecflags,
2272  VALUE ecopts)
2273 {
2274  rb_econv_t *ec;
2275  rb_transcoding *last_tc;
2276  rb_econv_result_t ret;
2277  unsigned char *out_start = *out_pos;
2278  int max_output;
2279  VALUE exc;
2280  VALUE fallback = Qnil;
2281  VALUE (*fallback_func)(VALUE, VALUE) = 0;
2282 
2283  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2284  if (!ec)
2285  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2286 
/* Choose how the fallback object will be invoked. */
2287  if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2288  fallback = rb_hash_aref(ecopts, sym_fallback);
2289  if (RB_TYPE_P(fallback, T_HASH)) {
2290  fallback_func = hash_fallback;
2291  }
2292  else if (rb_obj_is_proc(fallback)) {
2293  fallback_func = proc_fallback;
2294  }
2295  else if (rb_obj_is_method(fallback)) {
2296  fallback_func = method_fallback;
2297  }
2298  else {
2299  fallback_func = aref_fallback;
2300  }
2301  }
2302  last_tc = ec->last_tc;
2303  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2304 
2305  resume:
2306  ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2307 
2308  if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2309  VALUE rep = rb_enc_str_new(
2310  (const char *)ec->last_error.error_bytes_start,
2313  rep = (*fallback_func)(fallback, rep);
2314  if (rep != Qundef && !NIL_P(rep)) {
2315  StringValue(rep);
/* rb_econv_insert_output() returns an int (0 or -1) that is stored in the enum-typed `ret`, hence the cast in the test below. */
2316  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2317  RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2318  if ((int)ret == -1) {
2319  rb_raise(rb_eArgError, "too big fallback string");
2320  }
2321  goto resume;
2322  }
2323  }
2324 
2325  if (ret == econv_invalid_byte_sequence ||
2326  ret == econv_incomplete_input ||
2327  ret == econv_undefined_conversion) {
2328  exc = make_econv_exception(ec);
2329  rb_econv_close(ec);
2330  rb_exc_raise(exc);
2331  }
2332 
2333  if (ret == econv_destination_buffer_full) {
2334  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2335  goto resume;
2336  }
2337 
2338  rb_econv_close(ec);
2339  return;
2340 }
2341 #else
2342 /* sample transcode_loop implementation in byte-by-byte stream style */
/*
 * Alternative transcode_loop(), compiled only when the `#if 1` above is
 * turned off.  It feeds the converter one input byte at a time: a byte is
 * supplied whenever the previous result was econv_source_buffer_empty and
 * input remains; otherwise the converter is called with no input.  `ptr`
 * advances by however much of the one-byte buffer the converter consumed.
 *
 * NOTE(review): the initialisation of `ret` (listing line 2368) and several
 * case labels in the switch (listing lines 2389-2391, 2397, 2401) are absent
 * from this listing.
 */
2343 static void
2344 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2345  const unsigned char *in_stop, unsigned char *out_stop,
2346  VALUE destination,
2347  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2348  const char *src_encoding,
2349  const char *dst_encoding,
2350  int ecflags,
2351  VALUE ecopts)
2352 {
2353  rb_econv_t *ec;
2354  rb_transcoding *last_tc;
2355  rb_econv_result_t ret;
2356  unsigned char *out_start = *out_pos;
2357  const unsigned char *ptr;
2358  int max_output;
2359  VALUE exc;
2360 
2361  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2362  if (!ec)
2363  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2364 
2365  last_tc = ec->last_tc;
2366  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2367 
2369  ptr = *in_pos;
2370  while (ret != econv_finished) {
2371  unsigned char input_byte;
2372  const unsigned char *p = &input_byte;
2373 
2374  if (ret == econv_source_buffer_empty) {
2375  if (ptr < in_stop) {
2376  input_byte = *ptr;
2377  ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2378  }
2379  else {
/* No input left: call without ECONV_PARTIAL_INPUT. */
2380  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2381  }
2382  }
2383  else {
2384  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2385  }
/* p moved past input_byte only if the converter consumed it. */
2386  if (&input_byte != p)
2387  ptr += p - &input_byte;
2388  switch (ret) {
2392  exc = make_econv_exception(ec);
2393  rb_econv_close(ec);
2394  rb_exc_raise(exc);
2395  break;
2396 
2398  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2399  break;
2400 
2402  break;
2403 
2404  case econv_finished:
2405  break;
2406  }
2407  }
2408  rb_econv_close(ec);
2409  *in_pos = in_stop;
2410  return;
2411 }
2412 #endif
2413 
2414 
2415 /*
2416  * String-specific code
2417  */
2418 
2419 static unsigned char *
2420 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2421 {
2422  rb_str_resize(destination, new_len);
2423  return (unsigned char *)RSTRING_PTR(destination);
2424 }
2425 
/*
 * Fold the conversion options found in the hash `opt` into `ecflags` and
 * return the updated flags.
 *
 * Keys read (through the sym_* globals): invalid, undef, replace, xml,
 * newline and, when newline is nil, the individual newline keys (the
 * crlf_newline and cr_newline lookups are visible here).  An unrecognised
 * value for invalid, undef, xml or newline raises ArgumentError.
 *
 * NOTE(review): the flag assignments for the two xml values (listing lines
 * 2459, 2462), for newline == sym_universal (2477), and the first lookup and
 * assignment of the individual newline keys (2501, 2503) are absent from
 * this listing.
 */
2426 static int
2427 econv_opts(VALUE opt, int ecflags)
2428 {
2429  VALUE v;
2430 
2431  v = rb_hash_aref(opt, sym_invalid);
2432  if (NIL_P(v)) {
2433  }
2434  else if (v==sym_replace) {
2435  ecflags |= ECONV_INVALID_REPLACE;
2436  }
2437  else {
2438  rb_raise(rb_eArgError, "unknown value for invalid character option");
2439  }
2440 
2441  v = rb_hash_aref(opt, sym_undef);
2442  if (NIL_P(v)) {
2443  }
2444  else if (v==sym_replace) {
2445  ecflags |= ECONV_UNDEF_REPLACE;
2446  }
2447  else {
2448  rb_raise(rb_eArgError, "unknown value for undefined character option");
2449  }
2450 
/* A replace entry on its own enables undefined-character replacement, unless invalid replacement was already selected. */
2451  v = rb_hash_aref(opt, sym_replace);
2452  if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2453  ecflags |= ECONV_UNDEF_REPLACE;
2454  }
2455 
2456  v = rb_hash_aref(opt, sym_xml);
2457  if (!NIL_P(v)) {
2458  if (v==sym_text) {
2460  }
2461  else if (v==sym_attr) {
2463  }
2464  else if (RB_TYPE_P(v, T_SYMBOL)) {
2465  rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
2466  }
2467  else {
2468  rb_raise(rb_eArgError, "unexpected value for xml option");
2469  }
2470  }
2471 
2472 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2473  v = rb_hash_aref(opt, sym_newline);
2474  if (!NIL_P(v)) {
/* A newline entry replaces whatever newline decorator flags were set before. */
2475  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2476  if (v == sym_universal) {
2478  }
2479  else if (v == sym_crlf) {
2480  ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2481  }
2482  else if (v == sym_cr) {
2483  ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2484  }
2485  else if (v == sym_lf) {
2486  /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2487  }
2488  else if (SYMBOL_P(v)) {
2489  rb_raise(rb_eArgError, "unexpected value for newline option: %s",
2490  rb_id2name(SYM2ID(v)));
2491  }
2492  else {
2493  rb_raise(rb_eArgError, "unexpected value for newline option");
2494  }
2495  }
2496  else
2497 #endif
2498  {
/* setflags collects decorators whose key is truthy; newlineflag records that at least one key was present (non-nil), even if false. */
2499  int setflags = 0, newlineflag = 0;
2500 
2502  if (RTEST(v))
2504  newlineflag |= !NIL_P(v);
2505 
2506  v = rb_hash_aref(opt, sym_crlf_newline);
2507  if (RTEST(v))
2508  setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2509  newlineflag |= !NIL_P(v);
2510 
2511  v = rb_hash_aref(opt, sym_cr_newline);
2512  if (RTEST(v))
2513  setflags |= ECONV_CR_NEWLINE_DECORATOR;
2514  newlineflag |= !NIL_P(v);
2515 
2516  if (newlineflag) {
2517  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2518  ecflags |= setflags;
2519  }
2520  }
2521 
2522  return ecflags;
2523 }
2524 
2525 int
2526 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2527 {
2528  VALUE newhash = Qnil;
2529  VALUE v;
2530 
2531  if (NIL_P(opthash)) {
2532  *opts = Qnil;
2533  return ecflags;
2534  }
2535  ecflags = econv_opts(opthash, ecflags);
2536 
2537  v = rb_hash_aref(opthash, sym_replace);
2538  if (!NIL_P(v)) {
2539  StringValue(v);
2541  VALUE dumped = rb_str_dump(v);
2542  rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2543  StringValueCStr(dumped),
2544  rb_enc_name(rb_enc_get(v)));
2545  }
2546  v = rb_str_new_frozen(v);
2547  newhash = rb_hash_new();
2548  rb_hash_aset(newhash, sym_replace, v);
2549  }
2550 
2551  v = rb_hash_aref(opthash, sym_fallback);
2552  if (!NIL_P(v)) {
2553  VALUE h = rb_check_hash_type(v);
2554  if (NIL_P(h)
2556  : (v = h, 1)) {
2557  if (NIL_P(newhash))
2558  newhash = rb_hash_new();
2559  rb_hash_aset(newhash, sym_fallback, v);
2560  }
2561  }
2562 
2563  if (!NIL_P(newhash))
2564  rb_hash_freeze(newhash);
2565  *opts = newhash;
2566 
2567  return ecflags;
2568 }
2569 
2570 int
2572 {
2573  return rb_econv_prepare_options(opthash, opts, 0);
2574 }
2575 
2576 rb_econv_t *
2577 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2578 {
2579  rb_econv_t *ec;
2580  VALUE replacement;
2581 
2582  if (NIL_P(opthash)) {
2583  replacement = Qnil;
2584  }
2585  else {
2586  if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2587  rb_bug("rb_econv_open_opts called with invalid opthash");
2588  replacement = rb_hash_aref(opthash, sym_replace);
2589  }
2590 
2591  ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2592  if (!ec)
2593  return ec;
2594 
2595  if (!NIL_P(replacement)) {
2596  int ret;
2597  rb_encoding *enc = rb_enc_get(replacement);
2598 
2599  ret = rb_econv_set_replacement(ec,
2600  (const unsigned char *)RSTRING_PTR(replacement),
2601  RSTRING_LEN(replacement),
2602  rb_enc_name(enc));
2603  if (ret == -1) {
2604  rb_econv_close(ec);
2605  return NULL;
2606  }
2607  }
2608  return ec;
2609 }
2610 
2611 static int
2612 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
2613 {
2614  rb_encoding *enc;
2615  const char *n;
2616  int encidx;
2617  VALUE encval;
2618 
2619  if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2620  !(enc = rb_enc_from_index(encidx))) {
2621  enc = NULL;
2622  encidx = 0;
2623  n = StringValueCStr(*arg);
2624  }
2625  else {
2626  n = rb_enc_name(enc);
2627  }
2628 
2629  *name_p = n;
2630  *enc_p = enc;
2631 
2632  return encidx;
2633 }
2634 
2635 static int
2636 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
2637  const char **sname_p, rb_encoding **senc_p,
2638  const char **dname_p, rb_encoding **denc_p)
2639 {
2640  rb_encoding *senc, *denc;
2641  const char *sname, *dname;
2642  int sencidx, dencidx;
2643 
2644  dencidx = enc_arg(arg1, &dname, &denc);
2645 
2646  if (NIL_P(*arg2)) {
2647  sencidx = rb_enc_get_index(str);
2648  senc = rb_enc_from_index(sencidx);
2649  sname = rb_enc_name(senc);
2650  }
2651  else {
2652  sencidx = enc_arg(arg2, &sname, &senc);
2653  }
2654 
2655  *sname_p = sname;
2656  *senc_p = senc;
2657  *dname_p = dname;
2658  *denc_p = denc;
2659  return dencidx;
2660 }
2661 
2662 static int
2663 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2664 {
2665  VALUE dest;
2666  VALUE str = *self;
2667  volatile VALUE arg1, arg2;
2668  long blen, slen;
2669  unsigned char *buf, *bp, *sp;
2670  const unsigned char *fromp;
2671  rb_encoding *senc, *denc;
2672  const char *sname, *dname;
2673  int dencidx;
2674  int explicitly_invalid_replace = TRUE;
2675 
2676  rb_check_arity(argc, 0, 2);
2677 
2678  if (argc == 0) {
2679  arg1 = rb_enc_default_internal();
2680  if (NIL_P(arg1)) {
2681  if (!ecflags) return -1;
2682  arg1 = rb_obj_encoding(str);
2683  }
2684  if (!(ecflags & ECONV_INVALID_MASK)) {
2685  explicitly_invalid_replace = FALSE;
2686  }
2688  }
2689  else {
2690  arg1 = argv[0];
2691  }
2692  arg2 = argc<=1 ? Qnil : argv[1];
2693  dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2694 
2695  if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2699  if (senc && senc == denc) {
2700  if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2701  VALUE rep = Qnil;
2702  if (!NIL_P(ecopts)) {
2703  rep = rb_hash_aref(ecopts, sym_replace);
2704  }
2705  dest = rb_str_scrub(str, rep);
2706  if (NIL_P(dest)) dest = str;
2707  *self = dest;
2708  return dencidx;
2709  }
2710  return NIL_P(arg2) ? -1 : dencidx;
2711  }
2712  if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2714  return dencidx;
2715  }
2716  }
2717  if (encoding_equal(sname, dname)) {
2718  return NIL_P(arg2) ? -1 : dencidx;
2719  }
2720  }
2721  else {
2722  if (encoding_equal(sname, dname)) {
2723  sname = "";
2724  dname = "";
2725  }
2726  }
2727 
2728  fromp = sp = (unsigned char *)RSTRING_PTR(str);
2729  slen = RSTRING_LEN(str);
2730  blen = slen + 30; /* len + margin */
2731  dest = rb_str_tmp_new(blen);
2732  bp = (unsigned char *)RSTRING_PTR(dest);
2733 
2734  transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2735  if (fromp != sp+slen) {
2736  rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2737  }
2738  buf = (unsigned char *)RSTRING_PTR(dest);
2739  *bp = '\0';
2740  rb_str_set_len(dest, bp - buf);
2741 
2742  /* set encoding */
2743  if (!denc) {
2744  dencidx = rb_define_dummy_encoding(dname);
2745  }
2746  *self = dest;
2747 
2748  return dencidx;
2749 }
2750 
2751 static int
2753 {
2754  VALUE opt;
2755  int ecflags = 0;
2756  VALUE ecopts = Qnil;
2757 
2758  argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2759  if (!NIL_P(opt)) {
2760  ecflags = rb_econv_prepare_opts(opt, &ecopts);
2761  }
2762  return str_transcode0(argc, argv, self, ecflags, ecopts);
2763 }
2764 
2765 static inline VALUE
2766 str_encode_associate(VALUE str, int encidx)
2767 {
2768  int cr = 0;
2769 
2770  rb_enc_associate_index(str, encidx);
2771 
2772  /* a transcoded string is never broken. */
2773  if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2775  }
2776  else {
2777  cr = ENC_CODERANGE_VALID;
2778  }
2779  ENC_CODERANGE_SET(str, cr);
2780  return str;
2781 }
2782 
2783 /*
2784  * call-seq:
2785  * str.encode!(encoding [, options] ) -> str
2786  * str.encode!(dst_encoding, src_encoding [, options] ) -> str
2787  *
2788  * The first form transcodes the contents of <i>str</i> from
2789  * str.encoding to +encoding+.
2790  * The second form transcodes the contents of <i>str</i> from
2791  * src_encoding to dst_encoding.
2792  * The options Hash gives details for conversion. See String#encode
2793  * for details.
2794  * Returns the string even if no changes were made.
2795  */
2796 
2797 static VALUE
2799 {
2800  VALUE newstr;
2801  int encidx;
2802 
2803  rb_check_frozen(str);
2804 
2805  newstr = str;
2806  encidx = str_transcode(argc, argv, &newstr);
2807 
2808  if (encidx < 0) return str;
2809  if (newstr == str) {
2810  rb_enc_associate_index(str, encidx);
2811  return str;
2812  }
2813  rb_str_shared_replace(str, newstr);
2814  return str_encode_associate(str, encidx);
2815 }
2816 
2817 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2818 
2819 /*
2820  * call-seq:
2821  * str.encode(encoding [, options] ) -> str
2822  * str.encode(dst_encoding, src_encoding [, options] ) -> str
2823  * str.encode([options]) -> str
2824  *
2825  * The first form returns a copy of +str+ transcoded
2826  * to encoding +encoding+.
2827  * The second form returns a copy of +str+ transcoded
2828  * from src_encoding to dst_encoding.
2829  * The last form returns a copy of +str+ transcoded to
2830  * <tt>Encoding.default_internal</tt>.
2831  *
2832  * By default, the first and second form raise
2833  * Encoding::UndefinedConversionError for characters that are
2834  * undefined in the destination encoding, and
2835  * Encoding::InvalidByteSequenceError for invalid byte sequences
2836  * in the source encoding. The last form by default does not raise
2837  * exceptions but uses replacement strings.
2838  *
2839  * The +options+ Hash gives details for conversion and can have the following
2840  * keys:
2841  *
2842  * :invalid ::
2843  * If the value is +:replace+, #encode replaces invalid byte sequences in
2844  * +str+ with the replacement character. The default is to raise the
2845  * Encoding::InvalidByteSequenceError exception.
2846  * :undef ::
2847  * If the value is +:replace+, #encode replaces characters which are
2848  * undefined in the destination encoding with the replacement character.
2849  * The default is to raise the Encoding::UndefinedConversionError.
2850  * :replace ::
2851  * Sets the replacement string to the given value. The default replacement
2852  * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2853  * :fallback ::
2854  * Sets the replacement string by the given object for undefined
2855  * character. The object should be a Hash, a Proc, a Method, or an
2856  * object which has [] method.
2857  * Its key is an undefined character encoded in the source encoding
2858  * of current transcoder. Its value can be any encoding as long as it
2859  * can be converted into the destination encoding of the transcoder.
2860  * :xml ::
2861  * The value must be +:text+ or +:attr+.
2862  * If the value is +:text+ #encode replaces undefined characters with their
2863  * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2864  * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2865  * If the value is +:attr+, #encode also quotes the replacement result
2866  * (using '"'), and replaces '"' with "&quot;".
2867  * :cr_newline ::
2868  * Replaces LF ("\n") with CR ("\r") if value is true.
2869  * :crlf_newline ::
2870  * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2871  * :universal_newline ::
2872  * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2873  */
2874 
2875 static VALUE
2877 {
2878  VALUE newstr = str;
2879  int encidx = str_transcode(argc, argv, &newstr);
2880  return encoded_dup(newstr, str, encidx);
2881 }
2882 
2883 VALUE
2884 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2885 {
2886  int argc = 1;
2887  VALUE *argv = &to;
2888  VALUE newstr = str;
2889  int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2890  return encoded_dup(newstr, str, encidx);
2891 }
2892 
2893 static VALUE
2894 encoded_dup(VALUE newstr, VALUE str, int encidx)
2895 {
2896  if (encidx < 0) return rb_str_dup(str);
2897  if (newstr == str) {
2898  newstr = rb_str_dup(str);
2899  rb_enc_associate_index(newstr, encidx);
2900  return newstr;
2901  }
2902  else {
2903  RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2904  }
2905  return str_encode_associate(newstr, encidx);
2906 }
2907 
2908 static void
2909 econv_free(void *ptr)
2910 {
2911  rb_econv_t *ec = ptr;
2912  rb_econv_close(ec);
2913 }
2914 
2915 static size_t
2916 econv_memsize(const void *ptr)
2917 {
2918  return ptr ? sizeof(rb_econv_t) : 0;
2919 }
2920 
2922  "econv",
2925 };
2926 
2927 static VALUE
2929 {
2930  return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2931 }
2932 
2933 static rb_encoding *
2935 {
2936  rb_encoding *enc;
2937  int idx;
2939  enc = rb_enc_from_index(idx);
2940  return enc;
2941 }
2942 
2943 static rb_encoding *
2944 make_encoding(const char *name)
2945 {
2946  rb_encoding *enc;
2947  enc = rb_enc_find(name);
2948  if (!enc)
2949  enc = make_dummy_encoding(name);
2950  return enc;
2951 }
2952 
2953 static VALUE
2954 make_encobj(const char *name)
2955 {
2957 }
2958 
2959 /*
2960  * call-seq:
2961  * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2962  * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2963  *
2964  * Returns the corresponding ASCII compatible encoding.
2965  *
2966  * Returns nil if the argument is an ASCII compatible encoding.
2967  *
2968  * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2969  * can represent exactly the same characters as the given ASCII incompatible encoding.
2970  * So, no conversion undefined error occurs when converting between the two encodings.
2971  *
2972  * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2973  * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2974  * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2975  *
2976  */
2977 static VALUE
2979 {
2980  const char *arg_name, *result_name;
2981  rb_encoding *arg_enc, *result_enc;
2982 
2983  enc_arg(&arg, &arg_name, &arg_enc);
2984 
2985  result_name = rb_econv_asciicompat_encoding(arg_name);
2986 
2987  if (result_name == NULL)
2988  return Qnil;
2989 
2990  result_enc = make_encoding(result_name);
2991 
2992  return rb_enc_from_encoding(result_enc);
2993 }
2994 
2995 static void
2997  volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
2998  const char **sname_p, const char **dname_p,
2999  rb_encoding **senc_p, rb_encoding **denc_p,
3000  int *ecflags_p,
3001  VALUE *ecopts_p)
3002 {
3003  VALUE opt, flags_v, ecopts;
3004  int sidx, didx;
3005  const char *sname, *dname;
3006  rb_encoding *senc, *denc;
3007  int ecflags;
3008 
3009  argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3010 
3011  if (!NIL_P(flags_v)) {
3012  if (!NIL_P(opt)) {
3013  rb_error_arity(argc + 1, 2, 3);
3014  }
3015  ecflags = NUM2INT(rb_to_int(flags_v));
3016  ecopts = Qnil;
3017  }
3018  else if (!NIL_P(opt)) {
3019  ecflags = rb_econv_prepare_opts(opt, &ecopts);
3020  }
3021  else {
3022  ecflags = 0;
3023  ecopts = Qnil;
3024  }
3025 
3026  senc = NULL;
3027  sidx = rb_to_encoding_index(*snamev_p);
3028  if (0 <= sidx) {
3029  senc = rb_enc_from_index(sidx);
3030  }
3031  else {
3032  StringValue(*snamev_p);
3033  }
3034 
3035  denc = NULL;
3036  didx = rb_to_encoding_index(*dnamev_p);
3037  if (0 <= didx) {
3038  denc = rb_enc_from_index(didx);
3039  }
3040  else {
3041  StringValue(*dnamev_p);
3042  }
3043 
3044  sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3045  dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3046 
3047  *sname_p = sname;
3048  *dname_p = dname;
3049  *senc_p = senc;
3050  *denc_p = denc;
3051  *ecflags_p = ecflags;
3052  *ecopts_p = ecopts;
3053 }
3054 
3055 static int
3056 decorate_convpath(VALUE convpath, int ecflags)
3057 {
3058  int num_decorators;
3059  const char *decorators[MAX_ECFLAGS_DECORATORS];
3060  int i;
3061  int n, len;
3062 
3063  num_decorators = decorator_names(ecflags, decorators);
3064  if (num_decorators == -1)
3065  return -1;
3066 
3067  len = n = RARRAY_LENINT(convpath);
3068  if (n != 0) {
3069  VALUE pair = RARRAY_AREF(convpath, n-1);
3070  if (RB_TYPE_P(pair, T_ARRAY)) {
3071  const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3072  const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3073  transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3074  const rb_transcoder *tr = load_transcoder_entry(entry);
3075  if (!tr)
3076  return -1;
3077  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3078  tr->asciicompat_type == asciicompat_encoder) {
3079  n--;
3080  rb_ary_store(convpath, len + num_decorators - 1, pair);
3081  }
3082  }
3083  else {
3084  rb_ary_store(convpath, len + num_decorators - 1, pair);
3085  }
3086  }
3087 
3088  for (i = 0; i < num_decorators; i++)
3089  rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3090 
3091  return 0;
3092 }
3093 
3094 static void
3095 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3096 {
3097  VALUE *ary_p = arg;
3098  VALUE v;
3099 
3100  if (*ary_p == Qnil) {
3101  *ary_p = rb_ary_new();
3102  }
3103 
3104  if (DECORATOR_P(sname, dname)) {
3105  v = rb_str_new_cstr(dname);
3106  }
3107  else {
3108  v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3109  }
3110  rb_ary_store(*ary_p, depth, v);
3111 }
3112 
3113 /*
3114  * call-seq:
3115  * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3116  * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3117  *
3118  * Returns a conversion path.
3119  *
3120  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3121  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3122  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3123  *
3124  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3125  * or
3126  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3127  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3128  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3129  * # "universal_newline"]
3130  *
3131  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3132  * or
3133  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3134  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3135  * # "universal_newline",
3136  * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3137  */
3138 static VALUE
3140 {
3141  volatile VALUE snamev, dnamev;
3142  const char *sname, *dname;
3143  rb_encoding *senc, *denc;
3144  int ecflags;
3145  VALUE ecopts;
3146  VALUE convpath;
3147 
3148  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3149 
3150  convpath = Qnil;
3151  transcode_search_path(sname, dname, search_convpath_i, &convpath);
3152 
3153  if (NIL_P(convpath))
3154  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3155 
3156  if (decorate_convpath(convpath, ecflags) == -1)
3157  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3158 
3159  return convpath;
3160 }
3161 
3162 /*
3163  * Check the existence of a conversion path.
3164  * Returns the number of converters in the conversion path.
3165  * result: >=0:success -1:failure
3166  */
3167 int
3168 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3169 {
3170  VALUE convpath = Qnil;
3171  transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3172  &convpath);
3173  return RTEST(convpath);
3174 }
3175 
3178  int index;
3179  int ret;
3180 };
3181 
3182 static void
3183 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3184 {
3185  struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3186  int ret;
3187 
3188  if (a->ret == -1)
3189  return;
3190 
3191  ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3192 
3193  a->ret = ret;
3194  return;
3195 }
3196 
3197 static rb_econv_t *
3199  const char **sname_p, const char **dname_p,
3200  rb_encoding **senc_p, rb_encoding**denc_p)
3201 {
3202  rb_econv_t *ec;
3203  long i;
3204  int ret, first=1;
3205  VALUE elt;
3206  rb_encoding *senc = 0, *denc = 0;
3207  const char *sname, *dname;
3208 
3209  ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3210  DATA_PTR(self) = ec;
3211 
3212  for (i = 0; i < RARRAY_LEN(convpath); i++) {
3213  volatile VALUE snamev, dnamev;
3214  VALUE pair;
3215  elt = rb_ary_entry(convpath, i);
3216  if (!NIL_P(pair = rb_check_array_type(elt))) {
3217  if (RARRAY_LEN(pair) != 2)
3218  rb_raise(rb_eArgError, "not a 2-element array in convpath");
3219  snamev = rb_ary_entry(pair, 0);
3220  enc_arg(&snamev, &sname, &senc);
3221  dnamev = rb_ary_entry(pair, 1);
3222  enc_arg(&dnamev, &dname, &denc);
3223  }
3224  else {
3225  sname = "";
3226  dname = StringValueCStr(elt);
3227  }
3228  if (DECORATOR_P(sname, dname)) {
3229  ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3230  if (ret == -1)
3231  rb_raise(rb_eArgError, "decoration failed: %s", dname);
3232  }
3233  else {
3234  int j = ec->num_trans;
3235  struct rb_econv_init_by_convpath_t arg;
3236  arg.ec = ec;
3237  arg.index = ec->num_trans;
3238  arg.ret = 0;
3239  ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3240  if (ret == -1 || arg.ret == -1)
3241  rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
3242  if (first) {
3243  first = 0;
3244  *senc_p = senc;
3245  *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3246  }
3247  *denc_p = denc;
3248  *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3249  }
3250  }
3251 
3252  if (first) {
3253  *senc_p = NULL;
3254  *denc_p = NULL;
3255  *sname_p = "";
3256  *dname_p = "";
3257  }
3258 
3259  ec->source_encoding_name = *sname_p;
3260  ec->destination_encoding_name = *dname_p;
3261 
3262  return ec;
3263 }
3264 
3265 /*
3266  * call-seq:
3267  * Encoding::Converter.new(source_encoding, destination_encoding)
3268  * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3269  * Encoding::Converter.new(convpath)
3270  *
3271  * possible options elements:
3272  * hash form:
3273  * :invalid => nil # raise error on invalid byte sequence (default)
3274  * :invalid => :replace # replace invalid byte sequence
3275  * :undef => nil # raise error on undefined conversion (default)
3276  * :undef => :replace # replace undefined conversion
3277  * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3278  * :newline => :universal # decorator for converting CRLF and CR to LF
3279  * :newline => :crlf # decorator for converting LF to CRLF
3280  * :newline => :cr # decorator for converting LF to CR
3281  * :universal_newline => true # decorator for converting CRLF and CR to LF
3282  * :crlf_newline => true # decorator for converting LF to CRLF
3283  * :cr_newline => true # decorator for converting LF to CR
3284  * :xml => :text # escape as XML CharData.
3285  * :xml => :attr # escape as XML AttValue
3286  * integer form:
3287  * Encoding::Converter::INVALID_REPLACE
3288  * Encoding::Converter::UNDEF_REPLACE
3289  * Encoding::Converter::UNDEF_HEX_CHARREF
3290  * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3291  * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3292  * Encoding::Converter::CR_NEWLINE_DECORATOR
3293  * Encoding::Converter::XML_TEXT_DECORATOR
3294  * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3295  * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3296  *
3297  * Encoding::Converter.new creates an instance of Encoding::Converter.
3298  *
3299  * Source_encoding and destination_encoding should be a string or
3300  * Encoding object.
3301  *
3302  * opt should be nil, a hash or an integer.
3303  *
3304  * convpath should be an array.
3305  * convpath may contain
3306  * - two-element arrays which contain encodings or encoding names, or
3307  * - strings representing decorator names.
3308  *
3309  * Encoding::Converter.new optionally takes an option.
3310  * The option should be a hash or an integer.
3311  * The option hash can contain :invalid => nil, etc.
3312  * The option integer should be logical-or of constants such as
3313  * Encoding::Converter::INVALID_REPLACE, etc.
3314  *
3315  * [:invalid => nil]
3316  * Raise error on invalid byte sequence. This is the default behavior.
3317  * [:invalid => :replace]
3318  * Replace invalid byte sequence by replacement string.
3319  * [:undef => nil]
3320  * Raise an error if a character in source_encoding is not defined in destination_encoding.
3321  * This is the default behavior.
3322  * [:undef => :replace]
3323  * Replace undefined character in destination_encoding with replacement string.
3324  * [:replace => string]
3325  * Specify the replacement string.
3326  * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3327  * [:universal_newline => true]
3328  * Convert CRLF and CR to LF.
3329  * [:crlf_newline => true]
3330  * Convert LF to CRLF.
3331  * [:cr_newline => true]
3332  * Convert LF to CR.
3333  * [:xml => :text]
3334  * Escape as XML CharData.
3335  * This form can be used as a HTML 4.0 #PCDATA.
3336  * - '&' -> '&amp;'
3337  * - '<' -> '&lt;'
3338  * - '>' -> '&gt;'
3339  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3340  * [:xml => :attr]
3341  * Escape as XML AttValue.
3342  * The converted result is quoted as "...".
3343  * This form can be used as a HTML 4.0 attribute value.
3344  * - '&' -> '&amp;'
3345  * - '<' -> '&lt;'
3346  * - '>' -> '&gt;'
3347  * - '"' -> '&quot;'
3348  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3349  *
3350  * Examples:
3351  * # UTF-16BE to UTF-8
3352  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3353  *
3354  * # Usually, decorators such as newline conversion are inserted last.
3355  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3356  * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3357  * # "universal_newline"]
3358  *
3359  * # But, if the last encoding is ASCII incompatible,
3360  * # decorators are inserted before the last conversion.
3361  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3362  * p ec.convpath #=> ["crlf_newline",
3363  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3364  *
3365  * # Conversion path can be specified directly.
3366  * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3367  * p ec.convpath #=> ["universal_newline",
3368  * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3369  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3370  */
3371 static VALUE
3373 {
3374  VALUE ecopts;
3375  volatile VALUE snamev, dnamev;
3376  const char *sname, *dname;
3377  rb_encoding *senc, *denc;
3378  rb_econv_t *ec;
3379  int ecflags;
3380  VALUE convpath;
3381 
3382  if (rb_check_typeddata(self, &econv_data_type)) {
3383  rb_raise(rb_eTypeError, "already initialized");
3384  }
3385 
3386  if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3387  ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3388  ecflags = 0;
3389  ecopts = Qnil;
3390  }
3391  else {
3392  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3393  ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3394  }
3395 
3396  if (!ec) {
3397  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3398  RB_GC_GUARD(snamev);
3399  RB_GC_GUARD(dnamev);
3400  rb_exc_raise(exc);
3401  }
3402 
3403  if (!DECORATOR_P(sname, dname)) {
3404  if (!senc)
3405  senc = make_dummy_encoding(sname);
3406  if (!denc)
3407  denc = make_dummy_encoding(dname);
3408  RB_GC_GUARD(snamev);
3409  RB_GC_GUARD(dnamev);
3410  }
3411 
3412  ec->source_encoding = senc;
3413  ec->destination_encoding = denc;
3414 
3415  DATA_PTR(self) = ec;
3416 
3417  return self;
3418 }
3419 
3420 /*
3421  * call-seq:
3422  * ec.inspect -> string
3423  *
3424  * Returns a printable version of <i>ec</i>
3425  *
3426  * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3427  * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3428  *
3429  */
3430 static VALUE
3432 {
3433  const char *cname = rb_obj_classname(self);
3434  rb_econv_t *ec;
3435 
3437  if (!ec)
3438  return rb_sprintf("#<%s: uninitialized>", cname);
3439  else {
3440  const char *sname = ec->source_encoding_name;
3441  const char *dname = ec->destination_encoding_name;
3442  VALUE str;
3443  str = rb_sprintf("#<%s: ", cname);
3444  econv_description(sname, dname, ec->flags, str);
3445  rb_str_cat2(str, ">");
3446  return str;
3447  }
3448 }
3449 
3450 static rb_econv_t *
3452 {
3453  rb_econv_t *ec;
3454 
3456  if (!ec) {
3457  rb_raise(rb_eTypeError, "uninitialized encoding converter");
3458  }
3459  return ec;
3460 }
3461 
3462 /*
3463  * call-seq:
3464  * ec.source_encoding -> encoding
3465  *
3466  * Returns the source encoding as an Encoding object.
3467  */
3468 static VALUE
3470 {
3471  rb_econv_t *ec = check_econv(self);
3472  if (!ec->source_encoding)
3473  return Qnil;
3475 }
3476 
3477 /*
3478  * call-seq:
3479  * ec.destination_encoding -> encoding
3480  *
3481  * Returns the destination encoding as an Encoding object.
3482  */
3483 static VALUE
3485 {
3486  rb_econv_t *ec = check_econv(self);
3487  if (!ec->destination_encoding)
3488  return Qnil;
3490 }
3491 
3492 /*
3493  * call-seq:
3494  * ec.convpath -> ary
3495  *
3496  * Returns the conversion path of ec.
3497  *
3498  * The result is an array of conversions.
3499  *
3500  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3501  * p ec.convpath
3502  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3503  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3504  * # "crlf_newline"]
3505  *
3506  * Each element of the array is a pair of encodings or a string.
3507  * A pair means an encoding conversion.
3508  * A string means a decorator.
3509  *
3510  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3511  * a converter from ISO-8859-1 to UTF-8.
3512  * "crlf_newline" means newline converter from LF to CRLF.
3513  */
3514 static VALUE
3516 {
3517  rb_econv_t *ec = check_econv(self);
3518  VALUE result;
3519  int i;
3520 
3521  result = rb_ary_new();
3522  for (i = 0; i < ec->num_trans; i++) {
3523  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3524  VALUE v;
3525  if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3526  v = rb_str_new_cstr(tr->dst_encoding);
3527  else
3528  v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3529  rb_ary_push(result, v);
3530  }
3531  return result;
3532 }
3533 
3534 /*
3535  * call-seq:
3536  * ec == other -> true or false (nil if other is not an Encoding::Converter)
3537  */
3538 static VALUE
3540 {
3541  rb_econv_t *ec1 = check_econv(self);
3542  rb_econv_t *ec2;
3543  int i;
3544 
      /* An operand that is not an econv typed-data object yields nil, not false. */
3545  if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3546  return Qnil;
3547  }
3548  ec2 = DATA_PTR(other);
3549  if (!ec2) return Qfalse;
      /* Names are compared by pointer first, so strcmp only runs when the
       * two pointers differ. */
3550  if (ec1->source_encoding_name != ec2->source_encoding_name &&
3551  strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3552  return Qfalse;
      /* NOTE(review): the condition guarding the next return (listing lines
       * 3553-3554) is not visible in this listing. */
3555  return Qfalse;
3556  if (ec1->flags != ec2->flags) return Qfalse;
3557  if (ec1->replacement_enc != ec2->replacement_enc &&
3558  strcmp(ec1->replacement_enc, ec2->replacement_enc))
3559  return Qfalse;
3560  if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3561  if (ec1->replacement_str != ec2->replacement_str &&
      /* NOTE(review): the second operand of this condition (listing line
       * 3562) is not visible in this listing. */
3563  return Qfalse;
3564 
      /* Chains of equal length must hold the very same transcoder at every
       * position. */
3565  if (ec1->num_trans != ec2->num_trans) return Qfalse;
3566  for (i = 0; i < ec1->num_trans; i++) {
3567  if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3568  return Qfalse;
3569  }
3570  return Qtrue;
3571 }
3572 
/*
 * Translate a conversion result code (res) into the matching result symbol.
 * A code with no matching case is returned as an Integer via INT2NUM.
 */
3573 static VALUE
3575 {
3576  switch (res) {
      /* NOTE(review): only two cases are visible here; listing lines
       * 3577-3581 are not shown. */
3582  case econv_finished: return sym_finished;
3583  case econv_after_output: return sym_after_output;
3584  default: return INT2NUM(res); /* should not be reached */
3585  }
3586 }
3587 
3588 /*
3589  * call-seq:
3590  * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3591  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3592  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3593  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3594  *
3595  * possible opt elements:
3596  * hash form:
3597  * :partial_input => true # source buffer may be part of larger source
3598  * :after_output => true # stop conversion after output before input
3599  * integer form:
3600  * Encoding::Converter::PARTIAL_INPUT
3601  * Encoding::Converter::AFTER_OUTPUT
3602  *
3603  * possible results:
3604  * :invalid_byte_sequence
3605  * :incomplete_input
3606  * :undefined_conversion
3607  * :after_output
3608  * :destination_buffer_full
3609  * :source_buffer_empty
3610  * :finished
3611  *
3612  * primitive_convert converts source_buffer into destination_buffer.
3613  *
3614  * source_buffer should be a string or nil.
3615  * nil means an empty string.
3616  *
3617  * destination_buffer should be a string.
3618  *
3619  * destination_byteoffset should be an integer or nil.
3620  * nil means the end of destination_buffer.
3621  * If it is omitted, nil is assumed.
3622  *
3623  * destination_bytesize should be an integer or nil.
3624  * nil means unlimited.
3625  * If it is omitted, nil is assumed.
3626  *
3627  * opt should be nil, a hash or an integer.
3628  * nil means no flags.
3629  * If it is omitted, nil is assumed.
3630  *
3631  * primitive_convert converts the content of source_buffer from the beginning
3632  * and stores the result into destination_buffer.
3633  *
3634  * destination_byteoffset and destination_bytesize specify the region in which
3635  * the converted result is stored.
3636  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3637  * If destination_byteoffset is nil,
3638  * destination_buffer.bytesize is used for appending the result.
3639  * destination_bytesize specifies maximum number of bytes.
3640  * If destination_bytesize is nil,
3641  * destination size is unlimited.
3642  * After conversion, destination_buffer is resized to
3643  * destination_byteoffset + actually produced number of bytes.
3644  * Also destination_buffer's encoding is set to destination_encoding.
3645  *
3646  * primitive_convert drops the converted part of source_buffer.
3647  * the dropped part is converted in destination_buffer or
3648  * buffered in Encoding::Converter object.
3649  *
3650  * primitive_convert stops conversion when one of the following conditions is met.
3651  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3652  * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3653  * - unexpected end of source buffer (:incomplete_input)
3654  * this occurs only when :partial_input is not specified.
3655  * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3656  * - character not representable in output encoding (:undefined_conversion)
3657  * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3658  * - after some output is generated, before input is done (:after_output)
3659  * this occurs only when :after_output is specified.
3660  * - destination buffer is full (:destination_buffer_full)
3661  * this occurs only when destination_bytesize is non-nil.
3662  * - source buffer is empty (:source_buffer_empty)
3663  * this occurs only when :partial_input is specified.
3664  * - conversion is finished (:finished)
3665  *
3666  * example:
3667  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3668  * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3669  * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3670  *
3671  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3672  * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3673  * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3674  * ret = ec.primitive_convert(src, dst="", nil, 1)
3675  * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3676  * ret = ec.primitive_convert(src, dst="", nil, 1)
3677  * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3678  * ret = ec.primitive_convert(src, dst="", nil, 1)
3679  * p [ret, src, dst] #=> [:finished, "", "i"]
3680  *
3681  */
3682 static VALUE
3684 {
3685  VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3686  rb_econv_t *ec = check_econv(self);
3687  rb_econv_result_t res;
3688  const unsigned char *ip, *is;
3689  unsigned char *op, *os;
3690  long output_byteoffset, output_bytesize;
3691  unsigned long output_byteend;
3692  int flags;
3693 
      /* "23:" = two required arguments, up to three optional ones, and an
       * optional trailing option hash (stored in opt). */
3694  argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3695 
      /* For nil offset/size the 0 stored here is only a placeholder; the
       * effective values are computed further down. */
3696  if (NIL_P(output_byteoffset_v))
3697  output_byteoffset = 0; /* dummy */
3698  else
3699  output_byteoffset = NUM2LONG(output_byteoffset_v);
3700 
3701  if (NIL_P(output_bytesize_v))
3702  output_bytesize = 0; /* dummy */
3703  else
3704  output_bytesize = NUM2LONG(output_bytesize_v);
3705 
      /* Flags come either as an integer (5th positional argument) or as an
       * option hash, never both: giving both is reported as an arity error. */
3706  if (!NIL_P(flags_v)) {
3707  if (!NIL_P(opt)) {
3708  rb_error_arity(argc + 1, 2, 5);
3709  }
3710  flags = NUM2INT(rb_to_int(flags_v));
3711  }
3712  else if (!NIL_P(opt)) {
3713  VALUE v;
3714  flags = 0;
3715  v = rb_hash_aref(opt, sym_partial_input);
3716  if (RTEST(v))
3717  flags |= ECONV_PARTIAL_INPUT;
3718  v = rb_hash_aref(opt, sym_after_output);
3719  if (RTEST(v))
3720  flags |= ECONV_AFTER_OUTPUT;
3721  }
3722  else {
3723  flags = 0;
3724  }
3725 
3727  if (!NIL_P(input))
3728  StringValue(input);
3730 
      /* No explicit size: start from RSTRING_EMBED_LEN_MAX, or from the input
       * length when that is larger. The retry path below doubles this value
       * each time the destination fills up. */
3731  if (NIL_P(output_bytesize_v)) {
3732  output_bytesize = RSTRING_EMBED_LEN_MAX;
3733  if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3734  output_bytesize = RSTRING_LEN(input);
3735  }
3736 
3737  retry:
3738 
      /* A nil offset means "append", so it is re-read from the current
       * length of output on every pass through retry. */
3739  if (NIL_P(output_byteoffset_v))
3740  output_byteoffset = RSTRING_LEN(output);
3741 
3742  if (output_byteoffset < 0)
3743  rb_raise(rb_eArgError, "negative output_byteoffset");
3744 
3745  if (RSTRING_LEN(output) < output_byteoffset)
3746  rb_raise(rb_eArgError, "output_byteoffset too big");
3747 
3748  if (output_bytesize < 0)
3749  rb_raise(rb_eArgError, "negative output_bytesize");
3750 
      /* The end offset is computed in unsigned arithmetic so that wrap-around
       * and values beyond LONG_MAX can be rejected by the check that follows. */
3751  output_byteend = (unsigned long)output_byteoffset +
3752  (unsigned long)output_bytesize;
3753 
3754  if (output_byteend < (unsigned long)output_byteoffset ||
3755  LONG_MAX < output_byteend)
3756  rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3757 
      /* Grow output only when its capacity cannot hold the requested region. */
3758  if (rb_str_capacity(output) < output_byteend)
3759  rb_str_resize(output, output_byteend);
3760 
3761  if (NIL_P(input)) {
3762  ip = is = NULL;
3763  }
3764  else {
3765  ip = (const unsigned char *)RSTRING_PTR(input);
3766  is = ip + RSTRING_LEN(input);
3767  }
3768 
3769  op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3770  os = op + output_bytesize;
3771 
      /* rb_econv_convert advances ip and op; afterwards output is cut to the
       * position op reached and the consumed prefix is dropped from input. */
3772  res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3773  rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3774  if (!NIL_P(input)) {
3776  rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3777  }
3778 
      /* Unlimited size and the destination filled up: double the size
       * (refusing if that would overflow long), switch to append mode and
       * run the conversion again. */
3779  if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3780  if (LONG_MAX / 2 < output_bytesize)
3781  rb_raise(rb_eArgError, "too long conversion result");
3782  output_bytesize *= 2;
3783  output_byteoffset_v = Qnil;
3784  goto retry;
3785  }
3786 
      /* NOTE(review): the statement inside this block (listing line 3788) is
       * not visible in this listing. */
3787  if (ec->destination_encoding) {
3789  }
3790 
3791  return econv_result_to_symbol(res);
3792 }
3793 
3794 /*
3795  * call-seq:
3796  * ec.convert(source_string) -> destination_string
3797  *
3798  * Convert source_string and return destination_string.
3799  *
3800  * source_string is assumed as a part of source.
3801  * i.e. :partial_input=>true is specified internally.
3802  * finish method should be used last.
3803  *
3804  * ec = Encoding::Converter.new("utf-8", "euc-jp")
3805  * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3806  * puts ec.finish.dump #=> ""
3807  *
3808  * ec = Encoding::Converter.new("euc-jp", "utf-8")
3809  * puts ec.convert("\xA4").dump #=> ""
3810  * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3811  * puts ec.finish.dump #=> ""
3812  *
3813  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3814  * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3815  * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3816  * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3817  * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3818  *
3819  * If a conversion error occurs,
3820  * Encoding::UndefinedConversionError or
3821  * Encoding::InvalidByteSequenceError is raised.
3822  * Encoding::Converter#convert doesn't supply methods to recover or restart
3823  * from these exceptions.
3824  * When you want to handle these conversion errors,
3825  * use Encoding::Converter#primitive_convert.
3826  *
3827  */
3828 static VALUE
3829 econv_convert(VALUE self, VALUE source_string)
3830 {
3831  VALUE ret, dst;
3832  VALUE av[5];
3833  int ac;
3834  rb_econv_t *ec = check_econv(self);
3835 
3836  StringValue(source_string);
3837 
3838  dst = rb_str_new(NULL, 0);
3839 
      /* Arguments for econv_primitive_convert: a duplicate of the source (so
       * the caller's string is not the one handed over), a fresh destination,
       * nil offset and size, and the PARTIAL_INPUT flag. */
3840  av[0] = rb_str_dup(source_string);
3841  av[1] = dst;
3842  av[2] = Qnil;
3843  av[3] = Qnil;
3844  av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3845  ac = 5;
3846 
3847  ret = econv_primitive_convert(ac, av, self);
3848 
      /* Error results are turned into the exception built by
       * make_econv_exception.
       * NOTE(review): the rest of this condition (listing lines 3850-3851)
       * is not visible in this listing. */
3849  if (ret == sym_invalid_byte_sequence ||
3852  VALUE exc = make_econv_exception(ec);
3853  rb_exc_raise(exc);
3854  }
3855 
3856  if (ret == sym_finished) {
3857  rb_raise(rb_eArgError, "converter already finished");
3858  }
3859 
      /* Any result other than :source_buffer_empty at this point is treated
       * as an interpreter bug. */
3860  if (ret != sym_source_buffer_empty) {
3861  rb_bug("unexpected result of econv_primitive_convert");
3862  }
3863 
3864  return dst;
3865 }
3866 
3867 /*
3868  * call-seq:
3869  * ec.finish -> string
3870  *
3871  * Finishes the converter.
3872  * It returns the last part of the converted string.
3873  *
3874  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3875  * p ec.convert("\u3042") #=> "\e$B$\""
3876  * p ec.finish #=> "\e(B"
3877  */
3878 static VALUE
3880 {
3881  VALUE ret, dst;
3882  VALUE av[5];
3883  int ac;
3884  rb_econv_t *ec = check_econv(self);
3885 
3886  dst = rb_str_new(NULL, 0);
3887 
      /* Arguments for econv_primitive_convert: nil input, a fresh
       * destination, nil offset and size, and flags 0 (no PARTIAL_INPUT). */
3888  av[0] = Qnil;
3889  av[1] = dst;
3890  av[2] = Qnil;
3891  av[3] = Qnil;
3892  av[4] = INT2FIX(0);
3893  ac = 5;
3894 
3895  ret = econv_primitive_convert(ac, av, self);
3896 
      /* Error results are turned into the exception built by
       * make_econv_exception.
       * NOTE(review): the rest of this condition (listing lines 3898-3899)
       * is not visible in this listing. */
3897  if (ret == sym_invalid_byte_sequence ||
3900  VALUE exc = make_econv_exception(ec);
3901  rb_exc_raise(exc);
3902  }
3903 
      /* Any result other than :finished at this point is treated as an
       * interpreter bug. */
3904  if (ret != sym_finished) {
3905  rb_bug("unexpected result of econv_primitive_convert");
3906  }
3907 
3908  return dst;
3909 }
3910 
3911 /*
3912  * call-seq:
3913  * ec.primitive_errinfo -> array
3914  *
3915  * primitive_errinfo returns important information regarding the last error
3916  * as a 5-element array:
3917  *
3918  * [result, enc1, enc2, error_bytes, readagain_bytes]
3919  *
3920  * result is the last result of primitive_convert.
3921  *
3922  * Other elements are only meaningful when result is
3923  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3924  *
3925  * enc1 and enc2 indicate a conversion step as a pair of strings.
3926  * For example, a converter from EUC-JP to ISO-8859-1 converts
3927  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3928  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3929  *
3930  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3931  * error_bytes is the discarded portion.
3932  * readagain_bytes is the buffered portion which is read again on the next conversion.
3933  *
3934  * Example:
3935  *
3936  * # \xff is invalid as EUC-JP.
3937  * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3938  * ec.primitive_convert(src="\xff", dst="", nil, 10)
3939  * p ec.primitive_errinfo
3940  * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
3941  *
3942  * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3943  * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3944  * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3945  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3946  * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3947  * p ec.primitive_errinfo
3948  * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3949  *
3950  * # partial character is invalid
3951  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3952  * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3953  * p ec.primitive_errinfo
3954  * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3955  *
3956  * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3957  * # partial characters.
3958  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3959  * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3960  * p ec.primitive_errinfo
3961  * #=> [:source_buffer_empty, nil, nil, nil, nil]
3962  *
3963  * # \xd8\x00\x00@ is invalid as UTF-16BE because
3964  * # no low surrogate after high surrogate (\xd8\x00).
3965  * # It is detected by 3rd byte (\00) which is part of next character.
3966  * # So the high surrogate (\xd8\x00) is discarded and
3967  * # the 3rd byte is read again later.
3968  * # Since the byte is buffered in ec, it is dropped from src.
3969  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3970  * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
3971  * p ec.primitive_errinfo
3972  * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
3973  * p src
3974  * #=> "@"
3975  *
3976  * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
3977  * # The problem is detected by 4th byte.
3978  * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
3979  * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
3980  * p ec.primitive_errinfo
3981  * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
3982  * p src
3983  * #=> ""
3984  *
3985  */
3986 static VALUE
3988 {
3989  rb_econv_t *ec = check_econv(self);
3990 
3991  VALUE ary;
3992 
      /* Room for five elements is reserved up front. */
3993  ary = rb_ary_new2(5);
3994 
      /* Index 4 is set to nil first.
       * NOTE(review): the statements that fill the other slots (listing
       * lines 3995, 3998-3999, 4001-4002, 4004-4006) are not visible in
       * this listing. */
3996  rb_ary_store(ary, 4, Qnil);
3997 
4000 
4003 
4007  }
4008 
4009  return ary;
4010 }
4011 
4012 /*
4013  * call-seq:
4014  * ec.insert_output(string) -> nil
4015  *
4016  * Inserts string into the encoding converter.
4017  * The string will be converted to the destination encoding and
4018  * output on later conversions.
4019  *
4020  * If the destination encoding is stateful,
4021  * string is converted according to the state and the state is updated.
4022  *
4023  * This method should be used only when a conversion error occurs.
4024  *
4025  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4026  * src = "HIRAGANA LETTER A is \u{3042}."
4027  * dst = ""
4028  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4029  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4030  * ec.insert_output("<err>")
4031  * p ec.primitive_convert(src, dst) #=> :finished
4032  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4033  *
4034  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4035  * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4036  * dst = ""
4037  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4038  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4039  * ec.insert_output "?" # state change required to output "?".
4040  * p ec.primitive_convert(src, dst) #=> :finished
4041  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4042  *
4043  */
4044 static VALUE
4046 {
4047  const char *insert_enc;
4048 
4049  int ret;
4050 
4051  rb_econv_t *ec = check_econv(self);
4052 
4053  StringValue(string);
      /* Ask the converter which encoding inserted output must be in, and
       * re-encode the argument to that encoding before passing its bytes on. */
4054  insert_enc = rb_econv_encoding_to_insert_output(ec);
4055  string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4056 
      /* A return value of -1 from rb_econv_insert_output is reported as
       * ArgumentError. */
4057  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4058  if (ret == -1) {
4059  rb_raise(rb_eArgError, "too big string");
4060  }
4061 
4062  return Qnil;
4063 }
4064 
4065 /*
4066  * call-seq:
4067  * ec.putback -> string
4068  * ec.putback(max_numbytes) -> string
4069  *
4070  * Put back the bytes which will be converted.
4071  *
4072  * These bytes result from an invalid_byte_sequence error.
4073  * When an invalid_byte_sequence error occurs, some bytes are discarded and
4074  * some bytes are buffered to be converted later.
4075  * The latter bytes can be put back.
4076  * It can be observed by
4077  * Encoding::InvalidByteSequenceError#readagain_bytes and
4078  * Encoding::Converter#primitive_errinfo.
4079  *
4080  * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4081  * src = "\x00\xd8\x61\x00"
4082  * dst = ""
4083  * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4084  * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4085  * p ec.putback #=> "a\x00"
4086  * p ec.putback #=> "" # no more bytes to put back
4087  *
4088  */
4089 static VALUE
4091 {
4092  rb_econv_t *ec = check_econv(self);
4093  int n;
4094  int putbackable;
4095  VALUE str, max;
4096 
      /* "01": no required arguments, one optional (max). */
4097  rb_scan_args(argc, argv, "01", &max);
4098 
      /* Without an argument take everything rb_econv_putbackable reports;
       * otherwise clamp the requested count to that amount. */
4099  if (NIL_P(max))
4100  n = rb_econv_putbackable(ec);
4101  else {
4102  n = NUM2INT(max);
4103  putbackable = rb_econv_putbackable(ec);
4104  if (putbackable < n)
4105  n = putbackable;
4106  }
4107 
      /* Allocate an n-byte string and pass its buffer to rb_econv_putback. */
4108  str = rb_str_new(NULL, n);
4109  rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4110 
      /* NOTE(review): the statement inside this block (listing line 4112) is
       * not visible in this listing. */
4111  if (ec->source_encoding) {
4113  }
4114 
4115  return str;
4116 }
4117 
4118 /*
4119  * call-seq:
4120  * ec.last_error -> exception or nil
4121  *
4122  * Returns an exception object for the last conversion.
4123  * Returns nil if the last conversion did not produce an error.
4124  *
4125  * "error" means that
4126  * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4127  * Encoding::Converter#convert and
4128  * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4129  * Encoding::Converter#primitive_convert.
4130  *
4131  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4132  * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4133  * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4134  * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4135  * p ec.last_error #=> nil
4136  *
4137  */
4138 static VALUE
4140 {
4141  rb_econv_t *ec = check_econv(self);
4142  VALUE exc;
4143 
      /* make_econv_exception may yield nil; either way its result is what
       * gets returned (both branches below return the same value). */
4144  exc = make_econv_exception(ec);
4145  if (NIL_P(exc))
4146  return Qnil;
4147  return exc;
4148 }
4149 
4150 /*
4151  * call-seq:
4152  * ec.replacement -> string
4153  *
4154  * Returns the replacement string.
4155  *
4156  * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4157  * p ec.replacement #=> "?"
4158  *
4159  * ec = Encoding::Converter.new("euc-jp", "utf-8")
4160  * p ec.replacement #=> "\uFFFD"
4161  */
4162 static VALUE
4164 {
4165  rb_econv_t *ec = check_econv(self);
4166  int ret;
4167  rb_encoding *enc;
4168 
      /* make_replacement is called before the replacement fields are read;
       * a return value of -1 is raised as UndefinedConversionError. */
4169  ret = make_replacement(ec);
4170  if (ret == -1) {
4171  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4172  }
4173 
      /* Build a new string from replacement_str/replacement_len, tagged with
       * the encoding looked up from replacement_enc. */
4174  enc = rb_enc_find(ec->replacement_enc);
4175  return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4176 }
4177 
4178 /*
4179  * call-seq:
4180  * ec.replacement = string
4181  *
4182  * Sets the replacement string.
4183  *
4184  * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4185  * ec.replacement = "<undef>"
4186  * p ec.convert("a \u3042 b") #=> "a <undef> b"
4187  */
4188 static VALUE
4190 {
4191  rb_econv_t *ec = check_econv(self);
4192  VALUE string = arg;
4193  int ret;
4194  rb_encoding *enc;
4195 
4196  StringValue(string);
4197  enc = rb_enc_get(string);
4198 
      /* The bytes, byte length and encoding name of the argument are passed
       * to a call whose result is stored in ret.
       * NOTE(review): the first line of that call (listing line 4199) is not
       * visible in this listing. */
4200  (const unsigned char *)RSTRING_PTR(string),
4201  RSTRING_LEN(string),
4202  rb_enc_name(enc));
4203 
4204  if (ret == -1) {
4205  /* xxx: rb_eInvalidByteSequenceError? */
4206  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4207  }
4208 
      /* The original argument is returned, not the local copy. */
4209  return arg;
4210 }
4211 
/*
 * Return the value produced by make_econv_exception() for the converter ec.
 * This is the non-static entry point; it adds nothing to the static helper.
 */
4212 VALUE
4213 rb_econv_make_exception(rb_econv_t *ec)
4214 {
4215  return make_econv_exception(ec);
4216 }
4217 
/*
 * Raise the exception that make_econv_exception() builds for the converter
 * ec. Returns normally, without raising, when that helper yields nil.
 */
4218 void
4219 rb_econv_check_error(rb_econv_t *ec)
4220 {
4221  VALUE exc;
4222 
4223  exc = make_econv_exception(ec);
4224  if (NIL_P(exc))
4225  return;
4226  rb_exc_raise(exc);
4227 }
4228 
4229 /*
4230  * call-seq:
4231  * ecerr.source_encoding_name -> string
4232  *
4233  * Returns the source encoding name as a string.
4234  */
4235 static VALUE
4237 {
      /* Read the value stored on self under the ID "source_encoding_name". */
4238  return rb_attr_get(self, rb_intern("source_encoding_name"));
4239 }
4240 
4241 /*
4242  * call-seq:
4243  * ecerr.source_encoding -> encoding
4244  *
4245  * Returns the source encoding as an encoding object.
4246  *
4247  * Note that the result may not be equal to the source encoding of
4248  * the encoding converter if the conversion has multiple steps.
4249  *
4250  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4251  * begin
4252  * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4253  * rescue Encoding::UndefinedConversionError
4254  * p $!.source_encoding #=> #<Encoding:UTF-8>
4255  * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4256  * p $!.source_encoding_name #=> "UTF-8"
4257  * p $!.destination_encoding_name #=> "EUC-JP"
4258  * end
4259  *
4260  */
4261 static VALUE
4263 {
      /* Read the value stored on self under the ID "source_encoding". */
4264  return rb_attr_get(self, rb_intern("source_encoding"));
4265 }
4266 
4267 /*
4268  * call-seq:
4269  * ecerr.destination_encoding_name -> string
4270  *
4271  * Returns the destination encoding name as a string.
4272  */
4273 static VALUE
4275 {
      /* Read the value stored on self under the ID "destination_encoding_name". */
4276  return rb_attr_get(self, rb_intern("destination_encoding_name"));
4277 }
4278 
4279 /*
4280  * call-seq:
4281  * ecerr.destination_encoding -> encoding
4282  *
4283  * Returns the destination encoding as an encoding object.
4284  */
4285 static VALUE
4287 {
      /* Read the value stored on self under the ID "destination_encoding". */
4288  return rb_attr_get(self, rb_intern("destination_encoding"));
4289 }
4290 
4291 /*
4292  * call-seq:
4293  * ecerr.error_char -> string
4294  *
4295  * Returns the one-character string which caused Encoding::UndefinedConversionError.
4296  *
4297  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4298  * begin
4299  * ec.convert("\xa0")
4300  * rescue Encoding::UndefinedConversionError
4301  * puts $!.error_char.dump #=> "\xC2\xA0"
4302  * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4303  * end
4304  *
4305  */
4306 static VALUE
4308 {
      /* Read the value stored on self under the ID "error_char". */
4309  return rb_attr_get(self, rb_intern("error_char"));
4310 }
4311 
4312 /*
4313  * call-seq:
4314  * ecerr.error_bytes -> string
4315  *
4316  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4317  *
4318  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4319  * begin
4320  * ec.convert("abc\xA1\xFFdef")
4321  * rescue Encoding::InvalidByteSequenceError
4322  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4323  * puts $!.error_bytes.dump #=> "\xA1"
4324  * puts $!.readagain_bytes.dump #=> "\xFF"
4325  * end
4326  */
4327 static VALUE
4329 {
      /* Read the value stored on self under the ID "error_bytes". */
4330  return rb_attr_get(self, rb_intern("error_bytes"));
4331 }
4332 
4333 /*
4334  * call-seq:
4335  * ecerr.readagain_bytes -> string
4336  *
4337  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4338  */
4339 static VALUE
4341 {
      /* Read the value stored on self under the ID "readagain_bytes". */
4342  return rb_attr_get(self, rb_intern("readagain_bytes"));
4343 }
4344 
4345 /*
4346  * call-seq:
4347  * ecerr.incomplete_input? -> true or false
4348  *
4349  * Returns true if the invalid byte sequence error is caused by
4350  * premature end of string.
4351  *
4352  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4353  *
4354  * begin
4355  * ec.convert("abc\xA1z")
4356  * rescue Encoding::InvalidByteSequenceError
4357  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4358  * p $!.incomplete_input? #=> false
4359  * end
4360  *
4361  * begin
4362  * ec.convert("abc\xA1")
4363  * ec.finish
4364  * rescue Encoding::InvalidByteSequenceError
4365  * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4366  * p $!.incomplete_input? #=> true
4367  * end
4368  */
4369 static VALUE
4371 {
      /* Read the value stored on self under the ID "incomplete_input". */
4372  return rb_attr_get(self, rb_intern("incomplete_input"));
4373 }
4374 
4375 /*
4376  * Document-class: Encoding::UndefinedConversionError
4377  *
4378  * Raised by Encoding and String methods when a transcoding operation
4379  * fails.
4380  */
4381 
4382 /*
4383  * Document-class: Encoding::InvalidByteSequenceError
4384  *
4385  * Raised by Encoding and String methods when the string being
4386  * transcoded contains a byte invalid for either the source or
4387  * the target encoding.
4388  */
4389 
4390 /*
4391  * Document-class: Encoding::ConverterNotFoundError
4392  *
4393  * Raised by transcoding methods when a named encoding does not
4394  * correspond with a known converter.
4395  */
4396 
4397 void
4399 {
4403 
4405 
      /* Intern the file-level sym_* symbols once at initialisation time. */
4406  sym_invalid = ID2SYM(rb_intern("invalid"));
4407  sym_undef = ID2SYM(rb_intern("undef"));
4408  sym_replace = ID2SYM(rb_intern("replace"));
4409  sym_fallback = ID2SYM(rb_intern("fallback"));
4410  sym_aref = ID2SYM(rb_intern("[]"));
4411  sym_xml = ID2SYM(rb_intern("xml"));
4412  sym_text = ID2SYM(rb_intern("text"));
4413  sym_attr = ID2SYM(rb_intern("attr"));
4414 
      /* This group includes the result symbols returned through
       * econv_result_to_symbol and the :partial_input / :after_output keys
       * that primitive_convert looks up in its option hash. */
4415  sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
4416  sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
4417  sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
4418  sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
4419  sym_finished = ID2SYM(rb_intern("finished"));
4420  sym_after_output = ID2SYM(rb_intern("after_output"));
4421  sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
4422  sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
4423  sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
4424  sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
4425  sym_partial_input = ID2SYM(rb_intern("partial_input"));
4426 
      /* These symbols are interned only when ENABLE_ECONV_NEWLINE_OPTION is
       * defined. */
4427 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4428  sym_newline = ID2SYM(rb_intern("newline"));
4429  sym_universal = ID2SYM(rb_intern("universal"));
4430  sym_crlf = ID2SYM(rb_intern("crlf"));
4431  sym_cr = ID2SYM(rb_intern("cr"));
4432  sym_lf = ID2SYM(rb_intern("lf"));
4433 #endif
4434 
      /* String#encode takes a variable number of arguments (arity -1), as
       * does String#encode!. */
4435  rb_define_method(rb_cString, "encode", str_encode, -1);
4436  rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4437 
      /* NOTE(review): each Document-const comment below is followed by a
       * listing line that is not visible here (presumably the matching
       * constant definition -- confirm against the full source). */
4457 
4458  /* Document-const: INVALID_MASK
4459  *
4460  * Mask for invalid byte sequences
4461  */
4463 
4464  /* Document-const: INVALID_REPLACE
4465  *
4466  * Replace invalid byte sequences
4467  */
4469 
4470  /* Document-const: UNDEF_MASK
4471  *
4472  * Mask for a valid character in the source encoding but no related
4473  * character(s) in destination encoding.
4474  */
4476 
4477  /* Document-const: UNDEF_REPLACE
4478  *
4479  * Replace byte sequences that are undefined in the destination encoding.
4480  */
4482 
4483  /* Document-const: UNDEF_HEX_CHARREF
4484  *
4485  * Replace byte sequences that are undefined in the destination encoding
4486  * with an XML hexadecimal character reference. This is valid for XML
4487  * conversion.
4488  */
4490 
4491  /* Document-const: PARTIAL_INPUT
4492  *
4493  * Indicates the source may be part of a larger string. See
4494  * primitive_convert for an example.
4495  */
4497 
4498  /* Document-const: AFTER_OUTPUT
4499  *
4500  * Stop converting after some output is complete but before all of the
4501  * input was consumed. See primitive_convert for an example.
4502  */
4504 
4505  /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4506  *
4507  * Decorator for converting CRLF and CR to LF
4508  */
4510 
4511  /* Document-const: CRLF_NEWLINE_DECORATOR
4512  *
4513  * Decorator for converting LF to CRLF
4514  */
4516 
4517  /* Document-const: CR_NEWLINE_DECORATOR
4518  *
4519  * Decorator for converting LF to CR
4520  */
4522 
4523  /* Document-const: XML_TEXT_DECORATOR
4524  *
4525  * Escape as XML CharData
4526  */
4528 
4529  /* Document-const: XML_ATTR_CONTENT_DECORATOR
4530  *
4531  * Escape as XML AttValue
4532  */
4534 
4535  /* Document-const: XML_ATTR_QUOTE_DECORATOR
4536  *
4537  * Escape as XML AttValue
4538  */
4540 
4546 
4554 
      /* Last step of initialisation: call Init_newline(). */
4555  Init_newline();
4556 }
RUBY_EXTERN VALUE rb_cString
Definition: ruby.h:1591
#define BL_ACTION(byte)
#define FOURbt
static VALUE sym_replace
Definition: transcode.c:27
const char * ascii_incompat_name
Definition: transcode.c:1763
unsigned char ary[8]
Definition: transcode.c:67
int rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
Definition: transcode.c:2571
#define ECONV_XML_TEXT_DECORATOR
Definition: encoding.h:335
#define T_SYMBOL
Definition: ruby.h:494
Definition: string.c:5343
#define FUNio
search_path_queue_t * queue
Definition: transcode.c:250
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:739
void rb_econv_check_error(rb_econv_t *ec)
Definition: transcode.c:4219
VALUE next_info
Definition: transcode.c:60
RUBY_EXTERN VALUE rb_cData
Definition: ruby.h:1568
static VALUE econv_destination_encoding(VALUE self)
Definition: transcode.c:3484
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:139
static VALUE sym_undefined_conversion
Definition: transcode.c:38
#define NOMAP
VALUE rb_eConverterNotFoundError
Definition: transcode.c:23
rb_econv_result_t
Definition: encoding.h:252
VALUE rb_ary_entry(VALUE ary, long offset)
Definition: array.c:1179
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:140
unsigned char * in_buf_end
Definition: transcode.c:126
const unsigned char * error_bytes_start
Definition: transcode.c:139
#define RARRAY_LEN(a)
Definition: ruby.h:878
void rb_bug(const char *fmt,...)
Definition: error.c:327
rb_econv_result_t last_result
Definition: transcode.c:108
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:156
VALUE rb_econv_make_exception(rb_econv_t *ec)
Definition: transcode.c:4213
#define FALSE
Definition: nkf.h:174
#define RUBY_TYPED_FREE_IMMEDIATELY
Definition: ruby.h:1015
const char * dst_encoding
rb_econv_result_t result
Definition: transcode.c:135
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:340
static VALUE sym_invalid_byte_sequence
Definition: transcode.c:37
size_t strlen(const char *)
#define INT2NUM(x)
Definition: ruby.h:1296
struct search_path_queue_tag search_path_queue_t
#define DECORATOR_P(sname, dname)
Definition: transcode.c:154
Definition: st.h:69
#define GB4bt
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1444
VALUE rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:2035
Definition: st.h:100
VALUE rb_cEncoding
Definition: encoding.c:37
VALUE rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
Definition: transcode.c:1811
#define NUM2INT(x)
Definition: ruby.h:630
static int max(int a, int b)
Definition: strftime.c:141
#define ZERObt
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1646
static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, const char *dst_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2265
VALUE rb_eInvalidByteSequenceError
Definition: transcode.c:22
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Definition: encoding.h:336
static void econv_args(int argc, VALUE *argv, volatile VALUE *snamev_p, volatile VALUE *dnamev_p, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p, int *ecflags_p, VALUE *ecopts_p)
Definition: transcode.c:2996
#define getGB4bt1(a)
#define FL_TAINT
Definition: ruby.h:1137
void rb_econv_binmode(rb_econv_t *ec)
Definition: transcode.c:1941
ssize_t writebuf_len
Definition: transcode.c:72
static void rb_transcoding_close(rb_transcoding *tc)
Definition: transcode.c:819
rb_encoding * source_encoding
Definition: transcode.c:146
static VALUE sym_newline
Definition: transcode.c:33
#define Qtrue
Definition: ruby.h:426
unsigned char * out_data_start
Definition: transcode.c:105
static int decorate_convpath(VALUE convpath, int ecflags)
Definition: transcode.c:3056
static int enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
Definition: transcode.c:2612
static VALUE sym_crlf_newline
Definition: transcode.c:30
void Init_newline(void)
Definition: newline.c:183
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: ruby.h:1027
#define MAX_ECFLAGS_DECORATORS
Definition: transcode.c:1024
static size_t rb_transcoding_memsize(rb_transcoding *tc)
Definition: transcode.c:835
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:54
#define TypedData_Get_Struct(obj, type, data_type, sval)
Definition: ruby.h:1041
int rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
Definition: transcode.c:2526
unsigned char * in_data_start
Definition: transcode.c:124
#define ECONV_ERROR_HANDLER_MASK
Definition: encoding.h:318
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1924
static int str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, const char **sname_p, rb_encoding **senc_p, const char **dname_p, rb_encoding **denc_p)
Definition: transcode.c:2636
VALUE rb_method_call(int, VALUE *, VALUE)
Definition: proc.c:1791
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:219
#define getBT3(a)
rb_encoding * destination_encoding
Definition: transcode.c:147
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Definition: encoding.h:339
struct rb_transcoding * tc
Definition: transcode.c:103
#define SUSPEND(ret, num)
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:102
#define next_byte
static VALUE sym_cr_newline
Definition: transcode.c:31
VALUE rb_eTypeError
Definition: error.c:548
#define next_table
static int str_transcode(int argc, VALUE *argv, VALUE *self)
Definition: transcode.c:2752
#define rb_check_arity
Definition: intern.h:296
static VALUE sym_aref
Definition: transcode.c:27
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
Definition: transcode.c:1854
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:900
VALUE rb_eEncodingError
Definition: error.c:554
void st_free_table(st_table *)
Definition: st.c:334
static VALUE econv_last_error(VALUE self)
Definition: transcode.c:4139
#define SYM2ID(x)
Definition: ruby.h:356
VALUE rb_obj_is_method(VALUE)
Definition: proc.c:1106
#define UNDEF
struct rb_transcoding * error_tc
Definition: transcode.c:136
static rb_econv_t * rb_econv_alloc(int n_hint)
Definition: transcode.c:853
void rb_str_set_len(VALUE, long)
Definition: string.c:2007
#define RBASIC_SET_CLASS(obj, cls)
Definition: internal.h:611
int rb_enc_str_coderange(VALUE)
Definition: string.c:435
static rb_econv_t * rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
Definition: transcode.c:930
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:657
VALUE rb_to_int(VALUE)
Definition: object.c:2700
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:1857
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:826
static void rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3183
#define RB_GC_GUARD(v)
Definition: ruby.h:523
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define T_HASH
Definition: ruby.h:485
const char * lib
Definition: transcode.c:159
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2884
#define THREEbt
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
Definition: transcode.c:2577
#define STR1
#define DATA_PTR(dta)
Definition: ruby.h:992
const rb_transcoder * transcoder
Definition: transcode.c:160
#define next_info
static int output_replacement_character(rb_econv_t *ec)
Definition: transcode.c:2229
#define T_ARRAY
Definition: ruby.h:484
const char * dname
Definition: transcode.c:158
static rb_econv_result_t rb_trans_conv(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int *result_position_ptr)
Definition: transcode.c:1173
void callback(ffi_cif *cif, void *resp, void **args, void *ctx)
Definition: closure.c:59
static rb_econv_result_t transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:754
static VALUE econv_finish(VALUE self)
Definition: transcode.c:3879
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1257
static VALUE econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
Definition: transcode.c:1977
static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:166
static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry)
Definition: transcode.c:362
VALUE rb_str_tmp_new(long)
Definition: string.c:919
static int transcode_search_path(const char *sname, const char *dname, void(*callback)(const char *sname, const char *dname, int depth, void *arg), void *arg)
Definition: transcode.c:277
struct rb_econv_t::@157 last_error
unsigned char * in_buf_start
Definition: transcode.c:123
static rb_econv_t * rb_econv_open0(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:973
static void econv_free(void *ptr)
Definition: transcode.c:2909
const char * enc
Definition: transcode.c:245
static VALUE sym_source_buffer_empty
Definition: transcode.c:40
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Definition: transcode.c:1751
#define FUNsio
#define ENC_CODERANGE_7BIT
Definition: encoding.h:49
size_t error_bytes_len
Definition: transcode.c:140
const char * rb_obj_classname(VALUE)
Definition: variable.c:406
#define rb_ary_new2
Definition: intern.h:90
#define getGB4bt2(a)
static VALUE sym_crlf
Definition: transcode.c:33
static VALUE econv_convert(VALUE self, VALUE source_string)
Definition: transcode.c:3829
static VALUE sym_partial_input
Definition: transcode.c:35
static const char transcoder_lib_prefix[]
Definition: transcode.c:230
RUBY_SYMBOL_EXPORT_BEGIN typedef unsigned long st_data_t
Definition: st.h:20
static rb_econv_t * rb_econv_init_by_convpath(VALUE self, VALUE convpath, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p)
Definition: transcode.c:3198
void rb_exc_raise(VALUE mesg)
Definition: eval.c:567
static unsigned char * output
Definition: nkf.c:32
static const char * get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
Definition: transcode.c:390
#define OBJ_INFECT_RAW(x, s)
Definition: ruby.h:1187
static VALUE str_encode_associate(VALUE str, int encidx)
Definition: transcode.c:2766
st_table * st_init_strcasetable(void)
Definition: st.c:296
#define FUNii
st_table * visited
Definition: transcode.c:249
#define RB_TYPE_P(obj, type)
Definition: ruby.h:1672
static VALUE ecerr_incomplete_input(VALUE self)
Definition: transcode.c:4370
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Definition: transcode.c:3168
#define fail()
int st_lookup(st_table *, st_data_t, st_data_t *)
static unsigned char * str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
Definition: transcode.c:2420
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:171
ssize_t readagain_len
Definition: transcode.c:65
static VALUE econv_primitive_errinfo(VALUE self)
Definition: transcode.c:3987
unsigned int output_index
Definition: transcode.c:62
unsigned int input
Definition: nkf.c:4311
#define TRANSCODING_READBUF(tc)
Definition: transcode.c:84
static size_t econv_memsize(const void *ptr)
Definition: transcode.c:2916
#define ALLOC_N(type, n)
Definition: ruby.h:1341
void Init_transcode(void)
Definition: transcode.c:4398
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:1402
unsigned char * in_data_end
Definition: transcode.c:125
static VALUE str_encode_bang(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2798
Definition: transcode.c:156
static VALUE str_encode(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2876
int num_finished
Definition: transcode.c:130
const char * destination_encoding
Definition: transcode.c:138
static int rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
Definition: transcode.c:1901
#define val
int resume_position
Definition: transcode.c:58
#define ECONV_INVALID_MASK
Definition: encoding.h:320
VALUE rb_eRuntimeError
Definition: error.c:547
#define RSTRING_END(str)
Definition: ruby.h:849
struct rb_econv_t rb_econv_t
Definition: encoding.h:262
#define SUSPEND_AFTER_OUTPUT(num)
#define getGB4bt3(a)
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:510
VALUE rb_str_cat2(VALUE, const char *)
Definition: string.c:2158
#define ECONV_INVALID_REPLACE
Definition: encoding.h:321
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1700
VALUE rb_ary_new(void)
Definition: array.c:499
#define dp(v)
Definition: vm_debug.h:21
static VALUE econv_get_replacement(VALUE self)
Definition: transcode.c:4163
#define ECONV_PARTIAL_INPUT
Definition: encoding.h:350
#define ECONV_AFTER_OUTPUT
Definition: encoding.h:351
#define snprintf
Definition: subst.h:6
#define NIL_P(v)
Definition: ruby.h:438
void st_add_direct(st_table *, st_data_t, st_data_t)
Definition: st.c:629
static void more_output_buffer(VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), int max_output, unsigned char **out_start_ptr, unsigned char **out_pos, unsigned char **out_stop_ptr)
Definition: transcode.c:2147
union rb_transcoding::@155 readbuf
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2228
void rb_ary_store(VALUE ary, long idx, VALUE val)
Definition: array.c:794
static VALUE sym_attr
Definition: transcode.c:28
static VALUE econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
Definition: transcode.c:3139
#define OBJ_FROZEN(x)
Definition: ruby.h:1193
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Definition: transcode.c:1864
static st_table * transcoder_table
Definition: transcode.c:163
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Definition: transcode.c:1585
const char * sname
Definition: transcode.c:157
int argc
Definition: ruby.c:131
#define Qfalse
Definition: ruby.h:425
static VALUE make_econv_exception(rb_econv_t *ec)
Definition: transcode.c:2046
VALUE rb_cEncodingConverter
Definition: transcode.c:25
VALUE rb_require_safe(VALUE, int)
Definition: load.c:945
static const rb_data_type_t econv_data_type
Definition: transcode.c:2921
#define ALLOCA_N(type, n)
Definition: ruby.h:1345
static VALUE econv_set_replacement(VALUE self, VALUE arg)
Definition: transcode.c:4189
#define TRANSCODING_STATE(tc)
Definition: transcode.c:97
#define LONG_MAX
Definition: ruby.h:191
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1360
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:51
static VALUE sym_fallback
Definition: transcode.c:27
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:798
#define rb_ary_new4
Definition: intern.h:92
#define rb_str_new2
Definition: intern.h:840
int err
Definition: win32.c:114
#define OBJ_FREEZE(x)
Definition: ruby.h:1194
static VALUE method_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2253
rb_transcoder_asciicompat_type_t asciicompat_type
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
Definition: transcode.c:233
#define PRIdPTRDIFF
Definition: ruby.h:161
static VALUE econv_equal(VALUE self, VALUE other)
Definition: transcode.c:3539
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1907
#define ENC_CODERANGE_VALID
Definition: encoding.h:50
#define ECONV_UNDEF_MASK
Definition: encoding.h:323
#define ALLOC(type)
Definition: ruby.h:1342
#define SUSPEND_OBUF(num)
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2024
static int str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
Definition: transcode.c:2663
void rb_register_transcoder(const rb_transcoder *tr)
Definition: transcode.c:205
size_t rb_str_capacity(VALUE)
Definition: string.c:468
unsigned char * out_buf_start
Definition: transcode.c:104
static int transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:256
#define getGB4bt0(a)
static VALUE econv_putback(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:4090
ssize_t recognized_len
Definition: transcode.c:64
static VALUE sym_xml
Definition: transcode.c:28
int num_trans
Definition: transcode.c:129
#define FUNso
static void search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3095
#define RSTRING_LEN(str)
Definition: ruby.h:841
static rb_econv_t * check_econv(VALUE self)
Definition: transcode.c:3451
int num_additional
Definition: transcode.c:958
#define REALLOC_N(var, type, n)
Definition: ruby.h:1343
#define TRUE
Definition: nkf.h:175
VALUE rb_obj_is_proc(VALUE)
Definition: proc.c:94
static VALUE econv_s_allocate(VALUE klass)
Definition: transcode.c:2928
search_path_queue_t ** queue_last_ptr
Definition: transcode.c:251
VALUE rb_sprintf(const char *format,...)
Definition: sprintf.c:1250
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:958
static VALUE econv_insert_output(VALUE self, VALUE string)
Definition: transcode.c:4045
static VALUE ecerr_destination_encoding(VALUE self)
Definition: transcode.c:4286
int rb_econv_putbackable(rb_econv_t *ec)
Definition: transcode.c:1740
#define rb_enc_name(enc)
Definition: encoding.h:125
#define RSTRING_EMBED_LEN_MAX
Definition: ruby.h:819
unsigned char * out_buf_end
Definition: transcode.c:107
static int decorator_names(int ecflags, const char **decorators_ret)
Definition: transcode.c:1027
unsigned char next_byte
Definition: transcode.c:61
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Definition: transcode.c:2197
struct rb_transcoding * last_tc
Definition: transcode.c:131
#define MEMMOVE(p1, p2, type, n)
Definition: ruby.h:1361
#define STR1_BYTEINDEX(w)
VALUE rb_hash_new(void)
Definition: hash.c:307
static VALUE aref_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2259
static VALUE make_encobj(const char *name)
Definition: transcode.c:2954
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1719
const char * base_enc
Definition: transcode.c:252
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1133
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:597
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4308
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Definition: array.c:620
char ary[sizeof(double) > sizeof(void *) ? sizeof(double) :sizeof(void *)]
Definition: transcode.c:80
#define ECONV_CRLF_NEWLINE_DECORATOR
Definition: encoding.h:333
const char * source_encoding
Definition: transcode.c:137
#define Qnil
Definition: ruby.h:427
static VALUE sym_lf
Definition: transcode.c:33
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:437
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Definition: transcode.c:1870
static VALUE econv_init(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3372
unsigned long VALUE
Definition: ruby.h:88
static VALUE result
Definition: nkf.c:40
static VALUE sym_universal_newline
Definition: transcode.c:29
union rb_transcoding::rb_transcoding_state_t state
#define ECONV_NEWLINE_DECORATOR_MASK
Definition: encoding.h:328
const char * src_encoding
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:930
#define ECONV_UNDEF_HEX_CHARREF
Definition: encoding.h:325
#define getBT1(a)
static void trans_open_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:962
#define rb_enc_asciicompat(enc)
Definition: encoding.h:188
static VALUE sym_universal
Definition: transcode.c:33
VALUE rb_str_new_cstr(const char *)
Definition: string.c:560
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
static VALUE ecerr_error_char(VALUE self)
Definition: transcode.c:4307
VALUE rb_str_dump(VALUE)
Definition: string.c:4902
VALUE rb_proc_call(VALUE, VALUE)
Definition: proc.c:744
const char * ascii_compat_name
Definition: transcode.c:1762
unsigned char * ptr
Definition: transcode.c:68
static rb_encoding * make_encoding(const char *name)
Definition: transcode.c:2944
#define ECONV_CR_NEWLINE_DECORATOR
Definition: encoding.h:334
#define RARRAY_LENINT(ary)
Definition: ruby.h:884
VALUE rb_str_dup(VALUE)
Definition: string.c:1062
static VALUE econv_source_encoding(VALUE self)
Definition: transcode.c:3469
static VALUE proc_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2247
static VALUE sym_cr
Definition: transcode.c:33
static VALUE sym_finished
Definition: transcode.c:41
VALUE rb_hash_freeze(VALUE hash)
Definition: hash.c:62
#define FUNsi
#define FL_UNSET(x, f)
Definition: ruby.h:1177
#define INVALID
#define BL_MIN_BYTE
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1651
#define StringValueCStr(v)
Definition: ruby.h:541
static int make_replacement(rb_econv_t *ec)
Definition: transcode.c:2163
#define writebuf_len
#define RSTRING_PTR(str)
Definition: ruby.h:845
static rb_econv_result_t transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:429
#define ONEbt
#define rb_exc_new3
Definition: intern.h:248
#define ECONV_UNDEF_REPLACE
Definition: encoding.h:324
void rb_str_modify(VALUE)
Definition: string.c:1483
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:832
static VALUE sym_after_output
Definition: transcode.c:42
int size
Definition: encoding.c:49
static VALUE econv_inspect(VALUE self)
Definition: transcode.c:3431
#define f
#define INT2FIX(i)
Definition: ruby.h:231
static rb_transcoding * rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
Definition: transcode.c:780
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:520
#define RARRAY_AREF(a, i)
Definition: ruby.h:901
unsigned char * out_data_end
Definition: transcode.c:106
static rb_econv_result_t rb_econv_convert0(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1264
#define xmalloc
Definition: defines.h:108
#define SIZE_MAX
Definition: ruby.h:274
static int asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:1767
size_t rb_econv_memsize(rb_econv_t *ec)
Definition: transcode.c:1718
rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:1065
union rb_transcoding::@156 writebuf
#define TRANSCODING_WRITEBUF(tc)
Definition: transcode.c:88
static const unsigned char * transcode_char_start(rb_transcoding *tc, const unsigned char *in_start, const unsigned char *inchar_start, const unsigned char *in_p, size_t *char_len_ptr)
Definition: transcode.c:409
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:632
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:706
void rb_error_arity(int argc, int min, int max)
static VALUE ecerr_error_bytes(VALUE self)
Definition: transcode.c:4328
static rb_econv_result_t rb_transcoding_convert(rb_transcoding *tc, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:807
VALUE rb_str_catf(VALUE str, const char *format,...)
Definition: sprintf.c:1290
#define rb_funcall3
Definition: ruby.h:1465
uint8_t key[16]
Definition: random.c:1250
void rb_str_shared_replace(VALUE, VALUE)
Definition: string.c:972
#define RTEST(v)
Definition: ruby.h:437
static void declare_transcoder(const char *sname, const char *dname, const char *lib)
Definition: transcode.c:222
unsigned int next_table
Definition: transcode.c:59
size_t readagain_len
Definition: transcode.c:141
static int rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
Definition: transcode.c:1882
static VALUE sym_invalid
Definition: transcode.c:27
static int rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
Definition: transcode.c:891
#define getBT2(a)
static VALUE econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
Definition: transcode.c:2978
int num_allocated
Definition: transcode.c:128
#define BYTE_ADDR(index)
const char * destination_encoding_name
Definition: transcode.c:114
static VALUE econv_convpath(VALUE self)
Definition: transcode.c:3515
static int trans_sweep(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int start)
Definition: transcode.c:1092
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1460
const char * rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
Definition: transcode.c:1784
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:548
#define rb_safe_level()
Definition: tcltklib.c:95
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Definition: transcode.c:1501
static VALUE ecerr_source_encoding(VALUE self)
Definition: transcode.c:4262
static int output_hex_charref(rb_econv_t *ec)
Definition: transcode.c:1388
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Definition: transcode.c:1876
#define hash_fallback
Definition: transcode.c:2244
static VALUE ecerr_readagain_bytes(VALUE self)
Definition: transcode.c:4340
const char * name
Definition: nkf.c:208
#define xrealloc
Definition: defines.h:111
#define ID2SYM(x)
Definition: ruby.h:355
VALUE rb_eUndefinedConversionError
Definition: transcode.c:21
const char * rb_id2name(ID id)
Definition: ripper.c:17271
int started
Definition: transcode.c:116
rb_econv_elem_t * elems
Definition: transcode.c:127
static VALUE sym_text
Definition: transcode.c:28
const char * replacement_enc
Definition: transcode.c:120
VALUE rb_str_new_frozen(VALUE)
Definition: string.c:833
VALUE rb_str_drop_bytes(VALUE, long)
Definition: string.c:3640
const char * source_encoding_name
Definition: transcode.c:113
size_t replacement_len
Definition: transcode.c:119
int replacement_allocated
Definition: transcode.c:121
static VALUE sym_undef
Definition: transcode.c:27
#define BL_MAX_BYTE
struct search_path_queue_tag * next
Definition: transcode.c:244
int rb_enc_find_index(const char *name)
Definition: encoding.c:684
static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx)
Definition: transcode.c:2894
static int econv_opts(VALUE opt, int ecflags)
Definition: transcode.c:2427
#define rb_check_frozen(obj)
Definition: intern.h:277
static VALUE sym_destination_buffer_full
Definition: transcode.c:39
#define getBT0(a)
static unsigned char * allocate_converted_string(const char *sname, const char *dname, const unsigned char *str, size_t len, unsigned char *caller_dst_buf, size_t caller_dst_bufsize, size_t *dst_len_ptr)
Definition: transcode.c:1517
void void xfree(void *)
const rb_transcoder * transcoder
Definition: transcode.c:54
static transcoder_entry_t * get_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:189
#define rb_intern(str)
ssize_t writebuf_off
Definition: transcode.c:71
VALUE rb_str_buf_new(long)
Definition: string.c:891
#define SYMBOL_P(x)
Definition: ruby.h:354
#define TWObt
VALUE rb_str_scrub(VALUE, VALUE)
Definition: string.c:8037
#define NULL
Definition: _sdbm.c:102
struct rb_transcoding rb_transcoding
#define Qundef
Definition: ruby.h:428
st_index_t num_entries
Definition: st.h:85
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1479
int st_foreach(st_table *, int(*)(ANYARGS), st_data_t)
Definition: st.c:1034
const unsigned char * replacement_str
Definition: transcode.c:118
#define bp()
Definition: vm_debug.h:25
#define STR1_LENGTH(byte_addr)
#define encoding_equal(enc1, enc2)
Definition: transcode.c:241
#define TRANSCODING_WRITEBUF_SIZE(tc)
Definition: transcode.c:92
static rb_encoding * make_dummy_encoding(const char *name)
Definition: transcode.c:2934
VALUE rb_eArgError
Definition: error.c:549
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Definition: encoding.h:332
#define writebuf_off
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:708
#define NUM2LONG(x)
Definition: ruby.h:600
transcoder_entry_t ** entries
Definition: transcode.c:957
static VALUE econv_result_to_symbol(rb_econv_result_t res)
Definition: transcode.c:3574
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1127
char ** argv
Definition: ruby.c:132
#define StringValue(v)
Definition: ruby.h:539
static VALUE ecerr_source_encoding_name(VALUE self)
Definition: transcode.c:4236
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:590
static VALUE econv_primitive_convert(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3683
VALUE rb_obj_class(VALUE)
Definition: object.c:226
VALUE rb_str_new(const char *, long)
Definition: string.c:534
static VALUE ecerr_destination_encoding_name(VALUE self)
Definition: transcode.c:4274
static VALUE sym_incomplete_input
Definition: transcode.c:43