Relay-Version: version B 2.10.3 4.3bsd-beta 6/6/85; site seismo.UUCP Posting-Version: version B 2.10.2 9/3/84; site genrad.UUCP Path: seismo!harvard!talcott!panda!genrad!sources-request From: sources-request@genrad.UUCP Newsgroups: mod.sources Subject: Boyer-Moore based fgrep like program Message-ID: <933@genrad.UUCP> Date: 3 Jul 85 14:24:55 GMT Sender: john@genrad.UUCP Lines: 581 Approved: john@genrad.UUCP Mod.sources: Volume 2, Issue 3 Submitted by: Arnold Robbins The following program may be of some use to people out there. In the usual case of looking for a single string, it is 10% to 20% faster than fgrep. There is no makefile, since everything is in a single source file. Just compile with "cc -O bgrep.c -o bgrep". Enjoy, Arnold Robbins arnold@gatech.{UUCP, CSNET} -------------- cut here ------------------------ #!/bin/sh # This is a shell archive, meaning: # 1. Remove everything above the #!/bin/sh line. # 2. Save the resulting text in a file. # 3. Execute the file with /bin/sh (not csh) to create the files: # bgrep.c # bgrep.1 # This archive created: Mon Jul 1 10:52:37 1985 # By: Arnold Robbins (Pr1mebusters!) export PATH; PATH=/bin:$PATH echo shar: extracting "'bgrep.c'" '(8635 characters)' if test -f 'bgrep.c' then echo shar: over-writing existing file "'bgrep.c'" fi cat << \SHAR_EOF > 'bgrep.c' /* bgrep --- grep using Boyer-Moore pattern matcher */ /* * All the wrapper code (argument and file handling, printing, etc.) by * Arnold Robbins, gatech!arnold * * Boyer-Moore pattern matching, coded by Roy Mongiovi, gatech!gitpyr!roy * and edited some by Arnold Robbins. * * No warranty of suitability for any purpose, either expressed * or implied, is made. */ #include #include #define TRUE 1 #define FALSE 0 #define MAXPATS 120 /* (arbitrary maximum number of patterns */ #define MAXLINE (256 + 1) /* * command line options */ int Allbut = FALSE; /* print lines that don't match pattern */ int Exact = FALSE; /* only print lines that match exactly */ int Countlines = FALSE; /* only print a count of matching lines */ int Listnames = FALSE; /* only list file names that match */ int Numberlines = FALSE; /* print relative line number */ int Silent = FALSE; /* don't print anything, just exit */ int Monocase = FALSE; /* ignore case distinctions */ /* * variables */ long Curline = 0; /* current file input line */ long Lines_matched = 0; /* how many lines matched the pattern */ int Lotsafiles = FALSE; /* are there more than one file? */ int Pat_length[MAXPATS]; /* length of pattern */ int Line_length = 0; /* length of line */ int Couldnt_open_files = FALSE; /* one or more files could not be opened */ int Exit_val = 0; /* return code status */ int Curpat = 0; /* current pattern comparing against */ int Numpats = 0; /* total number of patterns */ char Inbuf[MAXLINE]; /* input buffer */ char Pattern[MAXPATS][MAXLINE]; /* pattern to be matched; init = NULs */ char *Program = NULL; /* program name */ int Argc; /* make argc and argv global */ char **Argv; extern char *basename(); /* leaf file of a pathname */ main (argc, argv, envp) int argc; char **argv, **envp; { register int i, j; char *index(); Program = basename (argv[0]); Argc = argc; Argv = argv; parse_args (); /* deal with command line */ if (Pattern[0][0] == '\0') /* not from -f or -e */ { if (Argv[0] == NULL) /* no string given */ usage (); setpats (Argv[0]); Argc--; Argv++; } for (Curpat = 0; Curpat <= Numpats; Curpat++) { if (Monocase) mapdown (Pattern[Curpat]); Pat_length[Curpat] = strlen (Pattern[Curpat]); initialize (); /* set up necessary tables */ } Lotsafiles = (Argc > 1); /* more than one file left */ if (Argc == 0) /* search stdin */ process ("-"); else for (i = 0; Argv[i] != NULL; i++) process (Argv[i]); if (! Silent && Countlines) printf ("%ld\n", Lines_matched); if (Lines_matched == 0) exit (1); else if (Couldnt_open_files) exit (2); else exit (0); } /* process --- start doing the work on each file */ process (infile) register char *infile; { FILE *fp; int c; int success; long prev_lines_matched = Lines_matched; /* save count */ Curline = 0; /* reset for each file */ if (infile[0] == '-' && infile[1] == '\0') fp = stdin; else if ((fp = fopen (infile, "r")) == NULL) { Couldnt_open_files = TRUE; perror (infile); return; } while (fgets (Inbuf, sizeof Inbuf, fp) != NULL) { Curline++; if (Monocase) mapdown (Inbuf); Line_length = strlen (Inbuf); /* first, throw away rest of a truncated input line */ if (Inbuf[Line_length - 1] != '\n') while ((c = getc (fp)) != '\n') ; else Inbuf[--Line_length] = '\0'; /* newline is there, nuke it */ for (Curpat = 0; Curpat <= Numpats; Curpat++) { if (success = match ()) Lines_matched++; /* do any necessary output */ if (! Silent && ! Countlines && ! Listnames && ((success != FALSE) ^ (Allbut != FALSE))) /* either success, or Allbut, but not both, and not neither */ { if (Lotsafiles) printf ("%s: ", infile); if (Numberlines) printf ("%ld: ", Curline); printf ("%s\n", Inbuf); } } } fclose (fp); if (! Silent && Listnames && prev_lines_matched < Lines_matched) printf ("%s\n", infile); } /* parse_args --- check out command line arguments */ parse_args () { register int i,j; if (Argc == 1) usage (); for (Argc--, Argv++; Argv[0] != NULL && Argv[0][0] == '-'; Argc--, Argv++) { int cheat = FALSE; for (j = 1; Argv[0][j] != '\0'; j++) { switch (Argv[0][j]) { case 'c': Countlines = TRUE; break; case 'e': strcpy (Pattern[0], Argv[1]); Pattern[0][sizeof Pattern[0] - 1] = '\0'; cheat = TRUE; continue; case 'f': patfromfile (Argv[1]); cheat = TRUE; continue; case 'i': case 'y': Monocase = TRUE; break; case 'l': Listnames = TRUE; break; case 'n': Numberlines = TRUE; break; case 's': Silent = TRUE; break; case 'v': Allbut = TRUE; break; case 'x': Exact = TRUE; break; default: usage (); break; } } if (cheat) { cheat = FALSE; Argc--; Argv++; /* boy is this stuff a kludge! */ } } /* check for argument conflicts */ if ( (Silent && (Allbut || Exact || Countlines || Listnames || Numberlines)) || (Allbut && Exact) || (Countlines && Listnames) ) { fprintf (stderr, "%s: argument conflict -- see the man page\n", Program); usage (); /* will exit */ } } /* mapdown --- remove case distinctions in a string */ mapdown (str) register char *str; { register int i; for (i = 0; str[i] != '\0'; i++) if (isupper (str[i])) str[i] = tolower (str[i]); } /* return basename part of a pathname, if '/'s are present */ char *basename (str) register char *str; { register int i = 0; register int j = 0; for (; str[i] != '\0'; i++) if (str[i] == '/') j = i; if (j != 0) return (& str[++j]); else return (str); } /* usage --- print usage message and die */ usage () { fprintf (stderr, "usage: %s [-vxclnisef] [string] [files]\n", Program); exit (2); } /* index --- do index by hand so it'll work on any Unix */ char *index (str, c) char *str, c; { for (; *str; str++) if (*str == c) return (str); return (NULL); } /* patfromfile --- retrieve the pattern from the named file */ patfromfile (infile) char *infile; { register int i, j; register FILE *fp; register int c; if ((fp = fopen (infile, "r")) == NULL || (c = getc (fp)) == EOF) { perror (infile); /* be like standard grep */ exit (2); } else ungetc (c, fp); for (i = 0; fgets (Pattern[i], MAXLINE, fp) != NULL; i++) { if (i >= 120) { fprintf (stderr, "%s: Only %d strings allowed\n", Program, MAXPATS); exit (2); } j = strlen (Pattern[i]); if (Pattern[i][j - 1] == '\n') Pattern[i][--j] = '\0'; } Numpats = i - 1; fclose (fp); } /* setpats --- set up the patterns from a string */ setpats (str) register char *str; { register int i, j; while (*str == '\n' || *str == '\r') str++; for (i = j = 0; *str; str++) { if (*str == '\n') { Pattern[i][j] = '\0'; j = 0; i++; } else Pattern[i][j++] = *str; } Numpats = i; } /* begin magic stuff for Boyer-Moore pattern matching */ int D1[MAXPATS][128]; int D2[MAXPATS][128]; int F[MAXPATS][128]; initialize () { register int i, t; for (i = 0; i < 128; i++) D1[Curpat][i] = Pat_length[Curpat]; for (i = 0; i < Pat_length[Curpat]; i++) D1[Curpat][Pattern[Curpat][i]] = Pat_length[Curpat] - i - 1; for (i = 0; i < Pat_length[Curpat]; i++) D2[Curpat][i] = (Pat_length[Curpat] << 1) - i - 1; for (i = (t = Pat_length[Curpat]) - 1; i >= 0; i--, t--) for (F[Curpat][i] = t; t < Pat_length[Curpat] && Pattern[Curpat][i] != Pattern[Curpat][t]; t = F[Curpat][t]) if (Pat_length[Curpat] - i - 1 < D2[Curpat][t]) D2[Curpat][t] = Pat_length[Curpat] - i - 1; for (i = 0; i <= t; i++) if (Pat_length[Curpat] + t - i < D2[Curpat][i]) D2[Curpat][i] = Pat_length[Curpat] + t - i; } /* match --- do Boyer-Moore pattern search on input buffer */ match () { register int i, j; if (Exact && Pat_length[Curpat] != Line_length) return FALSE; i = Pat_length[Curpat] - 1; while (i < Line_length) { j = Pat_length[Curpat] - 1; while (j >= 0) { if (Inbuf[i] == Pattern[Curpat][j]) i--, j--; else break; } if (j < 0) { /* found a match */ return TRUE; /* * note: if we were going to seach for further matches * on the input line, we would do this: * * i += Pat_length[Curpat] + 1; * * which shifts right by Pat_length[Curpat] + 1 places */ } else { j = (D1[Curpat][Inbuf[i]] >= D2[Curpat][j]) ? D1[Curpat][Inbuf[i]] : D2[Curpat][j]; i += j; /* shift right by j places */ } } return FALSE; } SHAR_EOF echo shar: extracting "'bgrep.1'" '(2419 characters)' if test -f 'bgrep.1' then echo shar: over-writing existing file "'bgrep.1'" fi cat << \SHAR_EOF > 'bgrep.1' .TH BGREP 1 "Georgia Tech" .SH NAME bgrep \- search a file for one or more simple strings .SH SYNOPSIS .B bgrep [ options ] string-list [ files ] .SH DESCRIPTION .I Bgrep (Boyer\-Moore grep) is program patterned after .IR fgrep (1). It uses the Boyer\-Moore string searching algorithm, which actually gets faster as the length of the pattern to be searched for increases. .I Bgrep searches for plain .I strings\^ (separated by newlines in the .I string-list\^ argument), not regular expressions in the style of .IR grep . .PP The following .I options are recognized: .TP .B \-v All lines but those matching are printed. .TP .B \-x (Exact) only lines matched in their entirety are printed. .TP .B \-c Only a count of the matching lines are printed. This is the total count, across all the input files. .TP .BR \-i " or " \-y Ignore case when trying to match a line. Both versions of this option are accepted for compatibility with .I grep on different versions of Unix. .TP .B \-l Only the names of files with matching lines are printed (once), separated by newlines. .TP .B \-n Each line is preceded by its relative line number within the file. .TP .B \-s Silent mode: No output is generated (except error messages). This is useful for just checking the exit status. .TP .BI \-e " string" Same as a simple .I string argument, but useful when the string begins with a \-. .TP .BI \-f " file" The strings to be searched for are read from .IR file . .PP The arguments to the .BR \-e " and " \-f options .I must be given as separate program arguments, i.e. separated by white space. .PP .I Bgrep catches conflicting arguments (e.g. .BR \-v " and " \-x ) and complains. .SH SEE ALSO .IR ed (1), .IR sed (1), .IR grep (1), .IR sh (1) .SH DIAGNOSTICS Exit status 0 if any matches were found, 1 if none, 2 for syntax errors on the command, or if any files could not be opened (even if matches were found). .SH BUGS Input lines and the strings to be searched for are limited to 256 characters. Longer input lines are truncated. .PP .I Bgrep\^ will not search for any more than 120 different strings. .PP Uses the package, which slows it down some. Nonetheless, in the usual case, it is 10% to 20% faster than .IR fgrep (1). .SH AUTHORS Roy Mongiovi (gatech!gitpyr!roy) coded the guts of the Boyer\-Moore algorithm, while Arnold Robbins (gatech!arnold) wrote the code to do all the rest of the work, and the man page. SHAR_EOF # End of shell archive exit 0