#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*

  RFC 2279 -- UTF-8, a transformation format of ISO 10646

  UCS-4 range (hex.)    UTF-8 octet sequence (binary)
  0000 0000-0000 007F   0xxxxxxx
  0000 0080-0000 07FF   110xxxxx 10xxxxxx
  0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx

  0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx

  HTML 4.01 Specification
  5.3.1 Numeric character references
    <URL:http://www.w3.org/TR/html40/charset.html#h-5.3.1>

  * The syntax "&#D;", where D is a decimal number, refers to the ISO
    10646 decimal character number D.

  * The syntax "&#xH;" or "&#XH;", where H is a hexadecimal number,
    refers to the ISO 10646 hexadecimal character number H. Hexadecimal
    numbers in numeric character references are case-insensitive.

*/

#define BUFLEN 256

/*
 * Write code point `d` to stdout as a UTF-8 byte sequence.
 *
 *   d  code point taken from a numeric character reference.
 *
 * Code point 9825 (U+2661) is not encoded; it is written back as the
 * literal reference "&#9825;".  NOTE(review): the reason for this
 * exception is not recorded in the source -- confirm before removing.
 *
 * Values that cannot be represented in well-formed UTF-8 (negative
 * numbers, the surrogate range D800-DFFF, anything above 10FFFF) are
 * replaced by U+FFFD REPLACEMENT CHARACTER.
 */
void
ucs4_to_utf8(int d)
{
  if (d == 9825) {
    printf("&#9825;");
    return;
  }

  if (d < 0 || d > 0x10ffff || (d >= 0xd800 && d <= 0xdfff))
    d = 0xfffd;

  if (d <= 0x7f) {
    /* 0xxxxxxx */
    putchar(d);
  } else if (d <= 0x7ff) {
    /* 110xxxxx 10xxxxxx */
    putchar(((d >> 6) & 0x1f) | 0xc0);
    putchar((d & 0x3f) | 0x80);
  } else if (d <= 0xffff) {
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    putchar(((d >> 12) & 0x0f) | 0xe0);
    putchar(((d >> 6) & 0x3f) | 0x80);
    putchar((d & 0x3f) | 0x80);
  } else {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    putchar(((d >> 18) & 0x07) | 0xf0);
    putchar(((d >> 12) & 0x3f) | 0x80);
    putchar(((d >> 6) & 0x3f) | 0x80);
    putchar((d & 0x3f) | 0x80);
  }
}

/*
 * Copy `fp` to stdout, replacing every decimal numeric character
 * reference "&#D;" (one or more digits followed by ';') with the
 * output of ucs4_to_utf8(D).
 *
 * Everything else is copied through byte for byte.  That includes
 * references that are not well formed: no digits ("&#;"), no closing
 * ';', a digit run longer than BUFLEN - 1, and hexadecimal references
 * ("&#xH;"), which this function does not decode.
 *
 * The character that ends a candidate reference is pushed back onto
 * the stream so that it is scanned again; this lets "&&#65;" and
 * "&#12&#65;" still find the second reference.
 */
void
convert(FILE* fp)
{
  int c, d;
  int idx;
  int have_term;
  char buf[BUFLEN];

  while ((c = fgetc(fp)) != EOF) {
    if (c != '&') {
      putchar(c);
      continue;
    }

    if ((c = fgetc(fp)) == EOF) {
      putchar('&');
      break;
    }
    if (c != '#') {
      /* Not a numeric reference; rescan c, it may itself be '&'. */
      putchar('&');
      ungetc(c, fp);
      continue;
    }

    /* Collect the digit run.  buf keeps the digits so the text can be
       echoed unchanged if this turns out not to be a reference. */
    idx = 0;
    d = 0;
    have_term = 0;
    while (idx < BUFLEN - 1) {
      c = fgetc(fp);
      if (c == EOF || !isdigit(c)) {
        have_term = 1;
        break;
      }
      buf[idx++] = (char)c;
      /* Stop accumulating once past the last Unicode code point so
         that long digit runs cannot overflow the int. */
      if (d <= 0x10ffff)
        d = d * 10 + (c - '0');
    }
    buf[idx] = '\0';

    if (have_term && c == ';' && idx > 0) {
      ucs4_to_utf8(d);
      continue;
    }

    /* Malformed: echo what was consumed and rescan the terminator. */
    printf("&#%s", buf);
    if (have_term) {
      if (c == EOF)
        break;
      ungetc(c, fp);
    }
  }
}

/*
 * Print the command-line synopsis to stderr and terminate the process
 * with exit status 1.  Does not return.
 */
void
usage(void)
{
  fputs("usage: charunref <file>\n", stderr);
  exit(1);
}

/*
 * Entry point: charunref <file>
 *
 * Takes exactly one argument.  "-h" prints the usage message; "-"
 * reads from standard input; anything else is opened as a file for
 * reading.  The input is passed through convert(), which writes the
 * result to stdout.
 *
 * Returns 0 on success and 1 if the file cannot be opened, if reading
 * the input fails, or if writing stdout fails.
 */
int
main(int argc, char* argv[])
{
  FILE* fp;
  int from_stdin;
  int status = 0;

  if (argc != 2 || strcmp(argv[1], "-h") == 0) {
    usage();
    return 1;  /* not reached: usage() exits */
  }

  from_stdin = (strcmp(argv[1], "-") == 0);
  fp = from_stdin ? stdin : fopen(argv[1], "r");
  if (!fp) {
    fprintf(stderr, "%s: Error: %s\n", argv[1], strerror(errno));
    exit(1);
  }

  convert(fp);

  if (ferror(fp)) {
    fprintf(stderr, "%s: Error: read failed\n", argv[1]);
    status = 1;
  }
  if (!from_stdin)
    fclose(fp);

  /* Flush explicitly so that a failed write is reported, not lost. */
  if (fflush(stdout) == EOF || ferror(stdout)) {
    fprintf(stderr, "stdout: Error: write failed\n");
    status = 1;
  }
  return status;
}

/* end of charunref.c */
