Date: Sat, 25 Sep 2021 08:41:29 GMT
From: Piotr Pawel Stefaniak <pstef@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject: git: bda949b6efdf - stable/13 - diff: read whole files to determine if they are ASCII text
Message-ID: <202109250841.18P8fTiU018051@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch stable/13 has been updated by pstef: URL: https://cgit.FreeBSD.org/src/commit/?id=bda949b6efdf099846775d3dc595c36561df1cca commit bda949b6efdf099846775d3dc595c36561df1cca Author: Piotr Pawel Stefaniak <pstef@FreeBSD.org> AuthorDate: 2021-08-22 19:57:13 +0000 Commit: Piotr Pawel Stefaniak <pstef@FreeBSD.org> CommitDate: 2021-09-25 08:34:39 +0000 diff: read whole files to determine if they are ASCII text Before this change, only the first BUFSIZE bytes were checked. (cherry picked from commit 3cbf98e2bee91db9ed9118ff557e02cdd449f49a) --- usr.bin/diff/diffreg.c | 59 +++++++++++++++++++++++++---------------- usr.bin/diff/tests/diff_test.sh | 14 ++++++++++ 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/usr.bin/diff/diffreg.c b/usr.bin/diff/diffreg.c index 113b7b621256..e728441c2cb2 100644 --- a/usr.bin/diff/diffreg.c +++ b/usr.bin/diff/diffreg.c @@ -180,6 +180,8 @@ struct context_vec { int d; /* end line in new file */ }; +enum readhash { RH_BINARY, RH_OK, RH_EOF }; + #define MIN_PAD 1 static FILE *opentemp(const char *); static void output(char *, FILE *, char *, FILE *, int); @@ -188,7 +190,7 @@ static void range(int, int, const char *); static void uni_range(int, int); static void dump_context_vec(FILE *, FILE *, int); static void dump_unified_vec(FILE *, FILE *, int); -static void prepare(int, FILE *, size_t, int); +static bool prepare(int, FILE *, size_t, int); static void prune(void); static void equiv(struct line *, int, struct line *, int, int *); static void unravel(int); @@ -206,7 +208,7 @@ static int search(int *, int, int); static int skipline(FILE *); static int isqrt(int); static int stone(int *, int, int *, int *, int); -static int readhash(FILE *, int); +static enum readhash readhash(FILE *, int, unsigned *); static int files_differ(FILE *, FILE *, int); static char *match_function(const long *, int, FILE *); static char *preadline(int, size_t, off_t); @@ -380,14 +382,16 @@ diffreg(char *file1, char *file2, int flags, int 
capsicum) status |= 1; goto closem; } - if ((flags & D_FORCEASCII) == 0 && - (!asciifile(f1) || !asciifile(f2))) { + if ((flags & D_FORCEASCII) != 0) { + (void)prepare(0, f1, stb1.st_size, flags); + (void)prepare(1, f2, stb2.st_size, flags); + } else if (!asciifile(f1) || !asciifile(f2) || + !prepare(0, f1, stb1.st_size, flags) || + !prepare(1, f2, stb2.st_size, flags)) { rval = D_BINARY; status |= 1; goto closem; } - prepare(0, f1, stb1.st_size, flags); - prepare(1, f2, stb2.st_size, flags); prune(); sort(sfile[0], slen[0]); @@ -511,12 +515,13 @@ splice(char *dir, char *path) return (buf); } -static void +static bool prepare(int i, FILE *fd, size_t filesize, int flags) { struct line *p; - int h; - size_t sz, j; + unsigned h; + size_t sz, j = 0; + enum readhash r; rewind(fd); @@ -525,15 +530,23 @@ prepare(int i, FILE *fd, size_t filesize, int flags) sz = 100; p = xcalloc(sz + 3, sizeof(*p)); - for (j = 0; (h = readhash(fd, flags));) { - if (j == sz) { - sz = sz * 3 / 2; - p = xreallocarray(p, sz + 3, sizeof(*p)); + while ((r = readhash(fd, flags, &h)) != RH_EOF) + switch (r) { + case RH_EOF: /* otherwise clang complains */ + case RH_BINARY: + return (false); + case RH_OK: + if (j == sz) { + sz = sz * 3 / 2; + p = xreallocarray(p, sz + 3, sizeof(*p)); + } + p[++j].value = h; } - p[++j].value = h; - } + len[i] = j; file[i] = p; + + return (true); } static void @@ -1350,8 +1363,8 @@ fetch(long *f, int a, int b, FILE *lb, int ch, int oldfile, int flags) /* * Hash function taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578. 
*/ -static int -readhash(FILE *f, int flags) +static enum readhash +readhash(FILE *f, int flags, unsigned *hash) { int i, t, space; unsigned sum; @@ -1360,6 +1373,9 @@ readhash(FILE *f, int flags) space = 0; for (i = 0;;) { switch (t = getc(f)) { + case '\0': + if ((flags & D_FORCEASCII) == 0) + return (RH_BINARY); case '\r': if (flags & D_STRIPCR) { t = getc(f); @@ -1387,18 +1403,15 @@ readhash(FILE *f, int flags) continue; case EOF: if (i == 0) - return (0); + return (RH_EOF); /* FALLTHROUGH */ case '\n': break; } break; } - /* - * There is a remote possibility that we end up with a zero sum. - * Zero is used as an EOF marker, so return 1 instead. - */ - return (sum == 0 ? 1 : sum); + *hash = sum; + return (RH_OK); } static int diff --git a/usr.bin/diff/tests/diff_test.sh b/usr.bin/diff/tests/diff_test.sh index c311c3bf2fbc..d96dd8c2a33e 100755 --- a/usr.bin/diff/tests/diff_test.sh +++ b/usr.bin/diff/tests/diff_test.sh @@ -18,6 +18,7 @@ atf_test_case conflicting_format atf_test_case label atf_test_case report_identical atf_test_case non_regular_file +atf_test_case binary simple_body() { @@ -264,6 +265,18 @@ non_regular_file_body() diff --label A --label B -u A B } +binary_body() +{ + # the NUL byte has to be after at least BUFSIZ bytes to trick asciifile() + yes 012345678901234567890123456789012345678901234567890 | head -n 174 > A + cp A B + printf '\n\0\n' >> A + printf '\nx\n' >> B + + atf_check -o inline:"Binary files A and B differ\n" -s exit:1 diff A B + atf_check -o inline:"176c\nx\n.\n" -s exit:1 diff -ae A B +} + atf_init_test_cases() { atf_add_test_case simple @@ -284,4 +297,5 @@ atf_init_test_cases() atf_add_test_case label atf_add_test_case report_identical atf_add_test_case non_regular_file + atf_add_test_case binary }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202109250841.18P8fTiU018051>