/*
 * Copyright (c) 2024 firk (firk@cantconnect.ru)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.
 * 4. Altered versions in any form must be plainly marked as such, and
 *    must not be misinterpreted as being the original software.
 *
 * This software is provided by the author and contributors `as is'
 * without any express or implied warranty.
 */

/*
 * NOTE(review): this copy of the file looks damaged. Wherever the code
 * should contain a "less-than" sign followed by an identifier, the text up
 * to the next "greater-than" sign seems to be missing (header names after
 * #include, loop conditions, and larger parts of several functions).
 * Each affected place is marked with "NOTE(review)" below. The code tokens
 * are reproduced exactly as found; the file will not compile until the
 * missing text is restored from an intact copy.
 */

#define _GNU_SOURCE /* needed for glibc memmem(), may be ignored on others */
#define _FILE_OFFSET_BITS 64

/*
 * NOTE(review): the eight header names are missing. The code visible below
 * uses fprintf/stderr, strlen, strerror, memmem, bcmp, errno/EINTR, assert,
 * exit, pread, write, close, off_t and ssize_t, so the headers declaring
 * those are required - confirm the exact list against the original.
 */
#include
#include
#include
#include
#include
#include
#include
#include

typedef unsigned long long ull;

/*
 * Command-line related strings. Only `str' (the byte string searched for)
 * and `modestr' are read in the visible code; the assignments are in the
 * missing part of main(). Presumably they hold the values of the file=,
 * str=, size= and mode= arguments - TODO confirm.
 */
static char const *file, *str, *sizestr, *modestr;
/* -1 = not decided yet, 0 = text, 1 = str contains bytes flagged by is_binchar() */
static int binmode;
/* not referenced in the visible code */
static ull size;
/*
 * Input buffer, used as a ring of `bufsize' bytes:
 *   bufbegin - index in buf[] of the oldest valid byte
 *   buflen   - number of valid bytes starting at bufbegin (may wrap)
 *   bufofs   - current search position, as an offset from bufbegin
 */
static char *buf;
/* 1 MiB scratch buffer for the direct file reads done by save_blob() */
static char sbuf[1048576];
static size_t bufsize, strsize; /* strsize = strlen(str) */
static size_t bufbegin, bufofs, buflen;
/*
 * filepos - file offset that corresponds to buf[bufbegin]
 * prevpos - lower limit for the backward scan in save_blob(); where it is
 *           updated is not visible in this copy
 */
static off_t filepos, prevpos;
/* fd - input descriptor used with pread(); eof - set once a read fails or returns 0 */
static int fd, eof;

static void do_all(void);
static int refill_input(void);
static void save_blob();

/*
 * Returns non-zero when c is one of the byte values treated as "binary":
 * 127, 0..6, or 14..31 except 27. Values 7..13 and 27 are therefore
 * treated as text.
 */
static int is_binchar(char c) {
    return (c==127 || c>=0 && c<=6 || c>=14 && c!=27 && c<=31);
}

/* Program name/version and licence text; not referenced in the visible code. */
static char const lic[] =
    "rawsearch 2024-02-10\n\n"
    " Copyright (c) 2024 firk (firk@cantconnect.ru)\n\n"
    " Redistribution and use in source and binary forms, with or without\n"
    " modification, are permitted provided that the following conditions\n"
    " are met:\n"
    " 1. Redistributions of source code must retain the above copyright\n"
    " notice, this list of conditions and the following disclaimer.\n"
    " 2. Redistributions in binary form must reproduce the above copyright\n"
    " notice, this list of conditions and the following disclaimer in the\n"
    " documentation and/or other materials provided with the distribution.\n"
    " 3. The origin of this software must not be misrepresented; you must\n"
    " not claim that you wrote the original software.\n"
    " 4. Altered versions in any form must be plainly marked as such, and\n"
    " must not be misinterpreted as being the original software.\n\n"
    " This software is provided by the author and contributors `as is'\n"
    " without any express or implied warranty.\n\n";

/*
 * Program entry point.
 *
 * Visible behaviour: defaults binmode to -1 when no mode string was given,
 * rejects an empty search string, scans the search string for bytes flagged
 * by is_binchar() (an error if binmode is 0, otherwise binmode becomes 1),
 * checks that strsize*4 does not overflow size_t, and starts sizing the
 * input buffer at 1 MiB.
 *
 * NOTE(review): the argument-parsing loop, the `usage' label with the start
 * of its message, and everything after the buffer sizing loop (presumably
 * buffer allocation, opening the input and calling do_all()) are missing.
 */
int main(unsigned int argc, char **argv) {
    unsigned int j;
    char const *a;
    size_t k;
    char *t;

    /* NOTE(review): damaged span - loop condition, argument parsing and the
     * beginning of the usage message are missing here. */
    for(j=1; j str= [mode=] [size=]\n");
        return -1;
    }
    if(!modestr) binmode = -1;
    strsize = strlen(str);
    if(!strsize) {
        fprintf(stderr, "empty string!\n");
        goto usage;
    }
    for(k=0; str[k]; k++) if(is_binchar(str[k])) {
        if(!binmode) {
            fprintf(stderr, "string contains non-text bytes but mode=text given\n");
            return -1;
        }
        binmode = 1;
    }
    if(binmode==-1) binmode = 0;
    /* overflow check: strsize*4 must be representable in size_t */
    k = strsize*4;
    if(k/4!=strsize) {
        fprintf(stderr, "string too long!\n");
        goto usage;
    }
    bufsize = 1048576;
    k = strsize*2;
    /*
     * NOTE(review): damaged span - the rest of main() and the beginning of
     * findstr() (its signature and the declarations of bb, avl and pos) are
     * missing. What follows is the tail of findstr().
     *
     * Visible findstr() logic: when at least strsize bytes are available, a
     * full match is searched with memmem(); if none is found, only the last
     * strsize-1 bytes are kept. The remaining loop returns the first
     * position where all bytes up to the end of the range equal the
     * beginning of str (a possible match continuing past the range), or
     * NULL when there is none. do_all() calls it as
     * findstr(buf+bufpos, bufavl).
     */
    while(bufsize=1);
    if(avl>=strsize) {
        if(pos=memmem(bb, avl, str, strsize)) return pos;
        bb = bb+avl-(strsize-1);
        avl = strsize-1;
    }
    while(avl) {
        if(!bcmp(bb,str,avl)) return bb;
        bb++;
        avl--;
    }
    return NULL;
}

/*
 * Main search loop: repeatedly refills the ring buffer, searches the
 * contiguous part that starts at the current position and calls save_blob()
 * for each match. Returns when the data is exhausted after end-of-file
 * (or a read error, which also sets eof).
 */
static void do_all(void) {
    size_t j, bufavl, bufpos, foundoffs;
    unsigned last_gb, cur_gb;
    char *found;

    last_gb = (unsigned)-1;
    while(1) {
        /* progress in tenths of GiB: (filepos in MiB)*10/1024; printed only when it changes */
        cur_gb = ((((ull)filepos) >> 20)*10) >> 10;
        if(cur_gb!=last_gb) {
            last_gb=cur_gb;
            fprintf(stderr, "\r%u.%u GB... ", cur_gb/10, cur_gb%10);
        }
        refill_input();
        assert(eof || buflen==bufsize);
        /* physical index of the search position, and the number of bytes
         * available from there without wrapping past the end of buf[] */
        bufpos = (bufbegin+bufofs)%bufsize;
        bufavl = buflen-bufofs;
        if(bufavl>bufsize-bufpos) bufavl = bufsize-bufpos;
        found = (char*)findstr(buf+bufpos, bufavl);
        if(!found) {
            /* the string surely not starts in the given range */
            bufofs += bufavl;
            if(bufofs==buflen && eof) return;
            /* drop the oldest bytes so that the search position is at most
             * bufsize/2 from the start; this frees room for refill_input() */
            if(bufofs>bufsize/2) {
                j = bufofs - bufsize/2;
                filepos += j;
                bufbegin = (bufbegin+j)%bufsize;
                bufofs -= j;
                buflen -= j;
            }
            continue;
        }
        foundoffs = found-(buf+bufpos);
        /* NOTE(review): damaged span - the rest of this assert, presumably
         * the update of bufofs/bufavl by foundoffs, and the start of the
         * following `if' condition are missing. */
        assert(foundoffs=strsize) {
            /* surely found */
            save_blob();
            continue;
        }
        if(strsize>buflen-bufofs) {
            /* bufavl==buflen-bufofs here too */
            if(eof) return;
            if(bufofs>bufsize/2) {
                j = bufofs - bufsize/2;
                filepos += j;
                bufbegin = (bufbegin+j)%bufsize;
                bufofs -= j;
                buflen -= j;
            }
            continue; /* get more data and search again */
        }
        /* the candidate reaches the physical end of buf[] and the data
         * continues at buf[0]: compare the remaining part of str there */
        assert(buflen-bufofs>bufavl && bufavl==bufsize-bufpos);
        if(!bcmp(str+bufavl, buf, strsize-bufavl /* <= buflen-bufofs-bufavl = (bufbegin+buflen-bufsize) = (bufbegin+buflen)%bufsize */ )) {
            save_blob();
            continue;
        }
        /* fail */
        bufofs++;
    }
}

/*
 * Reads more input into the free part of the ring buffer with pread() at
 * file offset filepos+buflen. A read error or a zero-length read prints a
 * message and sets eof; a successful read increases buflen and sets w.
 *
 * NOTE(review): the loop header, the computation of bufpos and the end of
 * the function (including its return value, presumably w) are missing.
 */
static int refill_input(void) {
    size_t bufpos, csz;
    ssize_t rsz;
    int w;

    /* NOTE(review): damaged span - the rest of this assert, the loop header
     * and the start of the expression that computes csz (the size of the
     * contiguous free region at bufpos) are missing. */
    assert(bufbegin=bufbegin)?(bufsize-bufpos):(bufbegin-bufpos);
        rsz = pread(fd, buf+bufpos, csz, filepos+buflen);
        if(rsz<0) {
            fprintf(stderr, "read (pos=%llu size=%llu) error %d (%s)\n", (ull)(filepos+buflen), (ull)csz, errno, strerror(errno));
            eof=1;
        } else if(!rsz) {
            fprintf(stderr, "reached end-of-file at pos=%llu\n", (ull)(filepos+buflen));
            eof=1;
        } else {
            assert(csz>=(size_t)rsz);
            buflen += rsz;
            w = 1;
        }
    }
    /*
     * NOTE(review): damaged span - the end of refill_input() and the
     * beginning of pread_all() (signature, locals, loop header) are missing.
     * What follows is the tail of pread_all().
     *
     * Visible pread_all() logic: reads with pread(fd1, b, csz, o), retrying
     * on EINTR; returns -1 on any other error, 0 when pread() returns 0, and
     * 1 after the loop ends. save_blob() calls it as
     * pread_all(fd, sbuf, csz, testpos).
     */
    assert(bufbegin=(ssize_t)csz) csz/=2;
        while((rsz = pread(fd1, b, csz, o))<0) if(errno!=EINTR) return -1;
        if(!rsz) return 0;
        assert(rsz>=1 && rsz<=(ssize_t)csz);
        o += rsz;
        b += rsz;
        n -= rsz;
    }
    return 1;
}

/*
 * Writes all n bytes at b to descriptor fd1.
 * Returns 0 on success, -1 when write() fails with anything but EINTR
 * (errno is left as set by write()).
 */
static int write_all(int fd1, char const *b, size_t n) {
    size_t csz;
    ssize_t wsz;

    /* intentional assignment: take the remaining count, stop when it is 0 */
    while(csz=n) {
        /* halve the chunk until it is positive when converted to ssize_t */
        while(0>=(ssize_t)csz) csz/=2;
        while((wsz = write(fd1, b, csz))<0) if(errno!=EINTR) return -1;
        /* a zero-length write is not expected here and trips the assert */
        assert(wsz>=1 && wsz<=(ssize_t)csz);
        b += wsz;
        n -= wsz;
    }
    return 0;
}

/* Output descriptor for the blob being saved; negative means "no output".
 * Where it is opened is not visible in this copy. */
static int tfd;

/*
 * Writes n bytes to the output file if one is open. On a write error a
 * message is printed and the file is closed, so later calls do nothing.
 */
static void tfd_write(char const *b, size_t n) {
    if(tfd>=0 && write_all(tfd,b,n)<0) {
        fprintf(stderr, "recovered file write error %d (%s)\n", errno, strerror(errno));
        close(tfd);
        tfd=-1;
    }
}

/* input:
 *   filepos+bufofs = position where the string found
 * vars should be consistent on return:
 *   filepos: file position matches buf[bufbegin] byte
 *   eof
 *   buf[], bufbegin, bufofs, buflen
 *   bufofs should point to byte after last saved one
 *
 * Visible behaviour: reports the match position, looks backwards for the
 * start of the blob (first inside the ring buffer, then in the file itself
 * down to prevpos, in chunks of at most 1 MiB), writes the data through
 * tfd_write()/write_all(), then reports the blob size and closes tfd.
 *
 * NOTE(review): four spans of this function are missing (marked below),
 * among them presumably the code that builds the output file name in tfn[]
 * and opens tfd, and the loops that find the end of the blob.
 */
static void save_blob(void) {
    char tfn[200];
    size_t j, csz;
    off_t startpos, testpos, pos, cszo;
    int r;

    fprintf(stderr, "found at byte %llu ", (ull)(filepos+bufofs));
    /* search for beginning */
    j = bufofs;
    while(j && !is_binchar(buf[(bufbegin+j-1)%bufsize])) j--;
    if(j) {
        /* stopped on a byte flagged by is_binchar() inside the buffer */
        startpos = filepos+j;
    } else {
        /* reached the start of the buffered data: continue backwards in the file */
        startpos = filepos;
        while(startpos>prevpos) {
            if(startpos-prevpos>1048576) {
                testpos = startpos-1048576;
                csz=1048576;
            } else {
                testpos = prevpos;
                csz = startpos-testpos;
            }
            r = pread_all(fd, sbuf, csz, testpos);
            if(r<0) {
                fprintf(stderr, "read (pos=%llu size=%llu) error %d (%s)\n", (ull)testpos, (ull)csz, errno, strerror(errno));
                /* TODO? */
                exit(-1);
            }
            if(!r) {
                fprintf(stderr, "reached unexpected end-of-file at pos=%llu\n", (ull)testpos);
                /* TODO? */
                exit(-1);
            }
            /* NOTE(review): damaged span - the backward scan over sbuf[],
             * the end of the enclosing loop and whatever follows it up to
             * the middle of an assert are missing. */
            for(j=0; j=testpos && startpos+j==testpos+csz && j<=csz && startpos+j<=filepos);
            /* the last j bytes of the chunk just read belong to the blob */
            tfd_write(sbuf+(csz-j), j);
            pos = j;
            /* NOTE(review): damaged span - the loop condition and the
             * computation of cszo are missing. The visible body copies the
             * file range between the chunk above and filepos to the output,
             * at most 1 MiB per step. */
            while((testpos=startpos+pos)=1);
            if(cszo>1048576) csz = 1048576;
            else csz = cszo;
            r = pread_all(fd, sbuf, csz, testpos);
            if(r<0) {
                fprintf(stderr, "read (pos=%llu size=%llu) error %d (%s)\n", (ull)testpos, (ull)csz, errno, strerror(errno));
                /* TODO? */
                exit(-1);
            }
            if(!r) {
                fprintf(stderr, "reached unexpected end-of-file at pos=%llu\n", (ull)testpos);
                /* TODO? */
                exit(-1);
            }
            tfd_write(sbuf, csz);
            pos += csz;
        }
    } else {
        /* the blob starts inside the buffer: drop the bytes before it */
        csz = startpos-filepos;
        assert(csz<=bufofs);
        filepos += csz;
        bufbegin = (bufbegin+csz)%bufsize;
        buflen -= csz;
        bufofs -= csz;
        pos = 0;
    }
    assert(startpos+pos==filepos);
    /* NOTE(review): damaged span - the loop that advances bufofs (presumably
     * to the end of the blob within the buffered data) and the start of an
     * assert are missing. */
    while(bufofs=strsize);
    /* write buf[bufbegin .. bufbegin+bufofs), in two parts if it wraps */
    csz = bufsize-bufbegin;
    if(bufofs<=csz) {
        tfd_write(buf+bufbegin, bufofs);
    } else {
        tfd_write(buf+bufbegin, csz);
        tfd_write(buf, bufofs-csz);
    }
    /* consume the written bytes */
    pos += bufofs;
    filepos += bufofs;
    bufbegin = (bufbegin+bufofs)%bufsize;
    buflen -= bufofs;
    bufofs = 0;
    /* the buffer ran empty before end-of-file: keep reading and saving */
    while(!eof && !buflen) {
        bufbegin = 0; /* buffer is empty anyway, adjust it startpos to zero for ease */
        refill_input();
        /* NOTE(review): damaged span - the loop that advances bufofs over
         * the new data and the start of the `if' condition are missing. */
        while(bufofs=0 && write_all(tfd, buf, bufofs)<0) {
            fprintf(stderr, "recovered file write error %d (%s)\n", errno, strerror(errno));
            close(tfd);
            tfd=-1;
        }
        pos += bufofs;
        filepos += bufofs;
        bufbegin = bufofs;
        buflen -= bufofs;
        bufofs = 0;
    }
    fprintf(stderr, "blob size is %llu bytes\n", (ull)pos);
    if(tfd>=0) close(tfd);
    return;
}