diff options
Diffstat (limited to 'util/sbase/sort.c')
-rw-r--r-- | util/sbase/sort.c | 437 |
1 files changed, 437 insertions, 0 deletions
diff --git a/util/sbase/sort.c b/util/sbase/sort.c new file mode 100644 index 00000000..fbb1abfe --- /dev/null +++ b/util/sbase/sort.c @@ -0,0 +1,437 @@ +/* See LICENSE file for copyright and license details. */ +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "queue.h" +#include "text.h" +#include "utf.h" +#include "util.h" + +struct keydef { + int start_column; + int end_column; + int start_char; + int end_char; + int flags; + TAILQ_ENTRY(keydef) entry; +}; + +struct column { + struct line line; + size_t cap; +}; + +enum { + MOD_N = 1 << 0, + MOD_STARTB = 1 << 1, + MOD_ENDB = 1 << 2, + MOD_R = 1 << 3, + MOD_D = 1 << 4, + MOD_F = 1 << 5, + MOD_I = 1 << 6, +}; + +static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead); + +static int Cflag = 0, cflag = 0, uflag = 0; +static char *fieldsep = NULL; +static size_t fieldseplen = 0; +static struct column col1, col2; + +static void +skipblank(struct line *a) +{ + while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) { + a->data++; + a->len--; + } +} + +static void +skipnonblank(struct line *a) +{ + while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' && + *(a->data) != '\t')) { + a->data++; + a->len--; + } +} + +static void +skipcolumn(struct line *a, int skip_to_next_col) +{ + char *s; + + if (fieldsep) { + if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) { + if (skip_to_next_col) + s += fieldseplen; + a->len -= s - a->data; + a->data = s; + } else { + a->data += a->len - 1; + a->len = 1; + } + } else { + skipblank(a); + skipnonblank(a); + } +} + +static void +columns(struct line *line, const struct keydef *kd, struct column *col) +{ + Rune r; + struct line start, end; + size_t utflen, rlen; + int i; + + start.data = line->data; + start.len = line->len; + for (i = 1; i < kd->start_column; i++) + skipcolumn(&start, 1); + if (kd->flags & MOD_STARTB) + skipblank(&start); + for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) { + rlen = chartorune(&r, start.data); + start.data += rlen; + start.len -= rlen; + utflen++; + } + + end.data = line->data; + end.len = line->len; + if (kd->end_column) { + for (i = 1; i < kd->end_column; i++) + skipcolumn(&end, 1); + if (kd->flags & MOD_ENDB) + skipblank(&end); + if (kd->end_char) { + for (utflen = 0; end.len > 1 && utflen < kd->end_char;) { + rlen = chartorune(&r, end.data); + end.data += rlen; + end.len -= rlen; + utflen++; + } + } else { + skipcolumn(&end, 0); + } + } else { + end.data += end.len - 1; + end.len = 1; + } + col->line.len = MAX(0, end.data - start.data); + if (!(col->line.data) || col->cap < col->line.len + 1) { + free(col->line.data); + col->line.data = emalloc(col->line.len + 1); + } + memcpy(col->line.data, start.data, col->line.len); + col->line.data[col->line.len] = '\0'; +} + +static int +skipmodcmp(struct line *a, struct line *b, int flags) +{ + Rune r1, r2; + size_t offa = 0, offb = 0; + + do { + offa += chartorune(&r1, a->data + offa); + offb += chartorune(&r2, b->data + offb); + + if (flags & MOD_D && flags & MOD_I) { + while (offa < a->len && ((!isblankrune(r1) && + !isalnumrune(r1)) || (!isprintrune(r1)))) + offa += chartorune(&r1, a->data + offa); + while (offb < b->len && ((!isblankrune(r2) && + !isalnumrune(r2)) || (!isprintrune(r2)))) + offb += chartorune(&r2, b->data + offb); + } + else if (flags & MOD_D) { + while (offa < a->len && !isblankrune(r1) && + !isalnumrune(r1)) + offa += chartorune(&r1, a->data + offa); + while (offb < b->len && !isblankrune(r2) && + !isalnumrune(r2)) + offb += chartorune(&r2, b->data + offb); + } + else if (flags & MOD_I) { + while (offa < a->len && !isprintrune(r1)) + offa += chartorune(&r1, a->data + offa); + while (offb < b->len && !isprintrune(r2)) + offb += chartorune(&r2, b->data + offb); + } + if (flags & MOD_F) { + r1 = toupperrune(r1); + r2 = toupperrune(r2); + } + } while (r1 && r1 == r2); + + return r1 - r2; +} + +static int +slinecmp(struct line *a, struct line *b) +{ + int res = 0; + double x, y; + struct keydef *kd; + + TAILQ_FOREACH(kd, &kdhead, entry) { + columns(a, kd, &col1); + columns(b, kd, &col2); + + /* if -u is given, don't use default key definition + * unless it is the only one */ + if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) && + TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) { + res = 0; + } else if (kd->flags & MOD_N) { + x = strtod(col1.line.data, NULL); + y = strtod(col2.line.data, NULL); + res = (x < y) ? -1 : (x > y); + } else if (kd->flags & (MOD_D | MOD_F | MOD_I)) { + res = skipmodcmp(&col1.line, &col2.line, kd->flags); + } else { + res = linecmp(&col1.line, &col2.line); + } + + if (kd->flags & MOD_R) + res = -res; + if (res) + break; + } + + return res; +} + +static int +check(FILE *fp, const char *fname) +{ + static struct line prev, cur, tmp; + static size_t prevsize, cursize, tmpsize; + ssize_t len; + + if (!prev.data) { + if ((len = getline(&prev.data, &prevsize, fp)) < 0) + eprintf("getline:"); + prev.len = len; + } + while ((len = getline(&cur.data, &cursize, fp)) > 0) { + cur.len = len; + if (uflag > slinecmp(&cur, &prev)) { + if (!Cflag) { + weprintf("disorder %s: ", fname); + fwrite(cur.data, 1, cur.len, stderr); + } + return 1; + } + tmp = cur; + tmpsize = cursize; + cur = prev; + cursize = prevsize; + prev = tmp; + prevsize = tmpsize; + } + + return 0; +} + +static int +parse_flags(char **s, int *flags, int bflag) +{ + while (isalpha((int)**s)) { + switch (*((*s)++)) { + case 'b': + *flags |= bflag; + break; + case 'd': + *flags |= MOD_D; + break; + case 'f': + *flags |= MOD_F; + break; + case 'i': + *flags |= MOD_I; + break; + case 'n': + *flags |= MOD_N; + break; + case 'r': + *flags |= MOD_R; + break; + default: + return -1; + } + } + + return 0; +} + +static void +addkeydef(char *kdstr, int flags) +{ + struct keydef *kd; + + kd = enmalloc(2, sizeof(*kd)); + + /* parse key definition kdstr with format + * start_column[.start_char][flags][,end_column[.end_char][flags]] + */ + kd->start_column = 1; + kd->start_char = 1; + kd->end_column = 0; /* 0 means end of line */ + kd->end_char = 0; /* 0 means end of column */ + kd->flags = flags; + + if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1) + enprintf(2, "invalid start column in key definition\n"); + + if (*kdstr == '.') { + if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1) + enprintf(2, "invalid start character in key " + "definition\n"); + } + if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0) + enprintf(2, "invalid start flags in key definition\n"); + + if (*kdstr == ',') { + if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0) + enprintf(2, "invalid end column in key definition\n"); + if (*kdstr == '.') { + if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0) + enprintf(2, "invalid end character in key " + "definition\n"); + } + if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0) + enprintf(2, "invalid end flags in key definition\n"); + } + + if (*kdstr != '\0') + enprintf(2, "invalid key definition\n"); + + TAILQ_INSERT_TAIL(&kdhead, kd, entry); +} + +static void +usage(void) +{ + enprintf(2, "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] " + "[-k def]... [file ...]\n", argv0); +} + +int +main(int argc, char *argv[]) +{ + FILE *fp, *ofp = stdout; + struct linebuf linebuf = EMPTY_LINEBUF; + size_t i; + int global_flags = 0, ret = 0; + char *outfile = NULL; + + ARGBEGIN { + case 'C': + Cflag = 1; + break; + case 'b': + global_flags |= MOD_STARTB | MOD_ENDB; + break; + case 'c': + cflag = 1; + break; + case 'd': + global_flags |= MOD_D; + break; + case 'f': + global_flags |= MOD_F; + break; + case 'i': + global_flags |= MOD_I; + break; + case 'k': + addkeydef(EARGF(usage()), global_flags); + break; + case 'm': + /* more or less for free, but for performance-reasons, + * we should keep this flag in mind and maybe some later + * day implement it properly so we don't run out of memory + * while merging large sorted files. + */ + break; + case 'n': + global_flags |= MOD_N; + break; + case 'o': + outfile = EARGF(usage()); + break; + case 'r': + global_flags |= MOD_R; + break; + case 't': + fieldsep = EARGF(usage()); + if (!*fieldsep) + eprintf("empty delimiter\n"); + fieldseplen = unescape(fieldsep); + break; + case 'u': + uflag = 1; + break; + default: + usage(); + } ARGEND + + /* -b shall only apply to custom key definitions */ + if (TAILQ_EMPTY(&kdhead) && global_flags) + addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB)); + if (TAILQ_EMPTY(&kdhead) || (!Cflag && !cflag)) + addkeydef("1", global_flags & MOD_R); + + if (!argc) { + if (Cflag || cflag) { + if (check(stdin, "<stdin>") && !ret) + ret = 1; + } else { + getlines(stdin, &linebuf); + } + } else for (; *argv; argc--, argv++) { + if (!strcmp(*argv, "-")) { + *argv = "<stdin>"; + fp = stdin; + } else if (!(fp = fopen(*argv, "r"))) { + enprintf(2, "fopen %s:", *argv); + continue; + } + if (Cflag || cflag) { + if (check(fp, *argv) && !ret) + ret = 1; + } else { + getlines(fp, &linebuf); + } + if (fp != stdin && fshut(fp, *argv)) + ret = 2; + } + + if (!Cflag && !cflag) { + if (outfile && !(ofp = fopen(outfile, "w"))) + eprintf("fopen %s:", outfile); + + qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines), + (int (*)(const void *, const void *))slinecmp); + + for (i = 0; i < linebuf.nlines; i++) { + if (!uflag || i == 0 || + slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) { + fwrite(linebuf.lines[i].data, 1, + linebuf.lines[i].len, ofp); + } + } + } + + if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") | + fshut(stderr, "<stderr>")) + ret = 2; + + return ret; +} |