Path: seismo!harvard!talcott!panda!sources-request From: sources-request@panda.UUCP Newsgroups: mod.sources Subject: Software similarity tester for C programs Message-ID: <1398@panda.UUCP> Date: 11 Feb 86 12:28:12 GMT Sender: jpn@panda.UUCP Lines: 1795 Approved: jpn@panda.UUCP Mod.sources: Volume 3, Issue 119 Submitted by: Dick Grune The enclosed shar-archive contains a program that will detect stretches in C-programs that look similar (or are just plain equal). This is useful for finding "borrowed" soft-ware or for isolating possible subroutines in large software systems. It is very fast and gives results. We have been using it for about half a year now. Dick Grune Vrije Universiteit de Boelelaan 1081 1081 HV Amsterdam the Netherlands ..!mcvax!vu44!dick dick@vu44.UUCP : This is a shar archive. Extract with sh, not csh. : This archive ends with exit, so do not worry about trailing junk. : --------------------------- cut here -------------------------- PATH=/bin:/usr/bin echo Extracting \R\E\A\D\_\M\E sed 's/^X//' > \R\E\A\D\_\M\E << '+ END-OF-FILE '\R\E\A\D\_\M\E XSat Jan 11 15:47:16 1986 X XThis program tests for similar (or equal) stretches in one or more C-programs. XSee sim.1 X XTo compile, call "make", which will generate one executable called sim, and Xwill run two small tests to show sample output. X XTo install, examine the Makefile and reset BINDIR and MANDIR to sensible Xpaths, and call "make install" X XTo change the default run size or the page width, adjust the file params.h Xand recompile. X XTo add another language X, write a file X.l along the lines of clang.l, Xextend the Makefile and recompile. All knowledge about C in located in Xclang.l; the rest of the programs expect each C lexical unit to be a single Xcharacter. X X Dick Grune X Vrije Universiteit X de Boelelaan 1081 X 1081 HV Amsterdam X the Netherlands X ..!mcvax!vu44!dick X dick@vu44.UUCP + END-OF-FILE READ_ME chmod 'u=rw,g=r,o=r' \R\E\A\D\_\M\E set `sum \R\E\A\D\_\M\E` sum=$1 case $sum in 26812) :;; *) echo 'Bad sum in '\R\E\A\D\_\M\E >&2 esac echo Extracting \M\a\k\e\f\i\l\e sed 's/^X//' > \M\a\k\e\f\i\l\e << '+ END-OF-FILE '\M\a\k\e\f\i\l\e X# This file is part of the software similarity tester SIM. X# Written by Dick Grune, Vrije Universiteit, Amsterdam. X# X XBINDIR = /user1/dick/bin# # where to put the binary (sim) XMANDIR = /user1/dick/man# # where to put the manual page (sim.1) X X# X# Each module (set of programs that together perform some function) X# has the following sets defined for it: X# FLS all files of that module, for, e.g., X# printing, sharring, inventory, etc. X# SRC the source files, from which other files derive X# CFS the C-files, from which the object files derive X# OBJ objects X# GRB garbage produced by compiling the module X# X# (This is a feeble attempt at software-engineering a Makefile.) X# X XTEST = pass3.c# # guinea pig X Xall: sim.res lang.res X X# The C Language module XCLN_OBJ = clang.o XCLN_CFS = clang.c XCLN_SRC = clang.l XCLN_FLS = $(CLN_SRC) X Xclang.c: clang.l X lex -t clang.l >clang.c XCLN_GRB = clang.c X X# Common modules: XCOM_OBJ = stream.o idf.o buff.o error.o XCOM_CFS = stream.c idf.c buff.c error.c XCOM_SRC = $(COM_CFS) XCOM_FLS = stream.h idf.h buff.h $(COM_SRC) X X# The top-package: XTOP_OBJ = top.o XTOP_CFS = top.c XTOP_SRC = $(TOP_CFS) XTOP_FLS = top.p top.g top.h $(TOP_SRC) X X# The similarity tester: XSIM_OBJ = sim.o pass1.o hash.o compare.o add_run.o pass2.o pass3.o XSIM_CFS = sim.c pass1.c hash.c compare.c add_run.c pass2.c pass3.c XSIM_SRC = $(SIM_CFS) XSIM_FLS = params.h text.h debug.h $(SIM_SRC) X XSIM = $(CLN_OBJ) $(COM_OBJ) $(TOP_OBJ) $(SIM_OBJ) XCFS = $(CLN_CFS) $(COM_CFS) $(TOP_CFS) $(SIM_CFS) X Xsim: $(SIM) X $(CC) $(SIM) -o sim X Xsim.res: sim $(TEST) X sim -hr 20 $(TEST) X Xlint: $(CFS) X lint -xa $(CFS) XSIM_GRB = sim X X# The language streamliner as a main program: XSTR_OBJ = main.o XSTR_CFS = main.c XSTR_SRC = $(STR_CFS) XSTR_FLS = $(STR_SRC) X XLANG_CFS = $(CLN_CFS) $(STR_CFS) $(COM_CFS) XLANG_OBJ = $(CLN_OBJ) $(STR_OBJ) $(COM_OBJ) Xlang: $(LANG_OBJ) X $(CC) $(LANG_OBJ) -o lang X Xlang.res: lang $(TEST) X lang -1 $(TEST) >lang1.res X lang -2 $(TEST) >lang2.res X wc lang[12].res $(TEST) XLANG_GRB = lang lang1.res lang2.res X Xlang.lint: X lint -xa $(LANG_CFS) X X# various other entries XFLS = READ_ME Makefile sim.1 \ X $(COM_FLS) $(TOP_FLS) $(SIM_FLS) $(STR_FLS) $(CLN_FLS) XSRC = $(COM_SRC) $(TOP_SRC) $(SIM_SRC) $(STR_SRC) $(CLN_SRC) XOBJ = $(COM_OBJ) $(TOP_OBJ) $(SIM_OBJ) $(STR_OBJ) $(CLN_OBJ) X Xprint: $(FLS) X pr $(FLS) >print XPRINT_GRB = print X Xshar: $(FLS) X shar $(FLS) >shar X Xfiles: Makefile X ls $(FLS) >files X Xcchk: X cchk $(CFS) X Xsimsim: sim X sim -hfr 20 $(SRC) X Xsimsimx: sim X sim -hfxr 20 $(SRC) X Xtags: $(SRC) X ctags $(SRC) X Xinstall: $(BINDIR)/sim $(MANDIR)/sim.1 X X$(BINDIR)/sim: sim X cp sim $(BINDIR)/sim X X$(MANDIR)/sim.1: sim.1 X cp sim.1 $(MANDIR)/sim.1 X Xclean: X rm -f $(OBJ) $(CLN_GRB) $(SIM_GRB) $(LANG_GRB) $(PRINT_GRB) \ X a.out core X X#------------------------------------------------------------------------ Xadd_run.o: buff.h text.h top.p top.h Xbuff.o: buff.h Xclang.o: idf.h stream.h Xcompare.o: buff.h text.h top.p top.h Xhash.o: buff.h text.h Xidf.o: idf.h Xpass1.o: buff.h text.h Xpass2.o: text.h top.p top.h debug.h Xpass3.o: params.h text.h top.p top.h debug.h buff.h Xsim.o: params.h Xstream.o: stream.h Xtop.o: text.h top.p top.h top.g + END-OF-FILE Makefile chmod 'u=rw,g=r,o=r' \M\a\k\e\f\i\l\e set `sum \M\a\k\e\f\i\l\e` sum=$1 case $sum in 32570) :;; *) echo 'Bad sum in '\M\a\k\e\f\i\l\e >&2 esac echo Extracting \s\i\m\.\1 sed 's/^X//' > \s\i\m\.\1 << '+ END-OF-FILE '\s\i\m\.\1 X.\" This file is part of the software similarity tester SIM. X.\" Written by Dick Grune, Vrije Universiteit, Amsterdam. X.\" X.TH SIM I X.SH NAME Xsim \- find similarities in C-files X.SH SYNOPSIS X.B sim X[ X.B \-[fns] X.BI \-r N X] Xfile ... X.SH DESCRIPTION X.I Sim Xreads the C-files X.I file ... Xand looks for pieces of text that are similar; two pieces of C-text Xare similar if they only differ in layout, comment, identifiers and Xthe contents of numbers, strings and characters. If any runs Xof sufficient length Xare found, they are reported on standard output; the default length Xminimum is 24, but can be reset by the X.BR \-r -parameter. X.PP XThe program can be used for finding copied pieces of code in Xpurportedly unrelated programs (with the X.BR \-s -flag), Xor for finding accidentally duplicated code in larger projects X(without the X.BR \-s -flag Xbut with the X.BR \-f -flag). X.PP XSince it reads the files several times, it cannot read from standard input. X.PP XThere are the following options: X.TP X.B \-f XRuns are restricted to pieces with balancing parentheses, to isolate Xpotential functions. X.TP X.B \-n XSimilarities found are only summarized, not displayed. X.TP X.B \-s XThe contents of a file are not compared to itself (\-s = not self). X.PP XThe matching process uses a hash table so that tens of thousands of Xlines are processed in a few minutes; if, however, there is not Xenough memory for the table, the matching process uses sequential Xsearch, which can take hours. X.SH AUTHOR XDick Grune, Vrije Universiteit, Amsterdam. X.SH BUGS XStrong periodicity in the input text (like a table of X.I N Xalmost identical lines) causes problems. X.I Sim Xtries to cope with this but cannot avoid giving appr. X.I log N Xmessages about it. The best advice is still to take the offending Xfiles out of the game. + END-OF-FILE sim.1 chmod 'u=rw,g=r,o=r' \s\i\m\.\1 set `sum \s\i\m\.\1` sum=$1 case $sum in 56000) :;; *) echo 'Bad sum in '\s\i\m\.\1 >&2 esac echo Extracting \s\t\r\e\a\m\.\h sed 's/^X//' > \s\t\r\e\a\m\.\h << '+ END-OF-FILE '\s\t\r\e\a\m\.\h X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X/* X Interface between the language-dependent lex module and X the stream module. X*/ X X/* communication variables */ Xextern int lex_no; /* pass 1: return stream of C condensed chars */ X /* pass 2: return C-char/ASCII-char position X pairs at each \n */ Xextern int lex_ch; /* condensed C-char produced by pass 1 */ Xextern unsigned int lex_ch_cnt; X /* C-char position reported at each \n by pass 2 */ Xextern long lex_ls_pos; /* lseek position reported at each \n by pass 2 */ X X/* #defines for the lex module */ X#define cput(ch) if (lex_ch_cnt++, lex_no == 1) \ X {lex_ch = ch; return 1;} else X#define c_eol() if (lex_no == 2) return 1; else X#define count() lex_ls_pos += yyleng + END-OF-FILE stream.h chmod 'u=rw,g=r,o=r' \s\t\r\e\a\m\.\h set `sum \s\t\r\e\a\m\.\h` sum=$1 case $sum in 61866) :;; *) echo 'Bad sum in '\s\t\r\e\a\m\.\h >&2 esac echo Extracting \i\d\f\.\h sed 's/^X//' > \i\d\f\.\h << '+ END-OF-FILE '\i\d\f\.\h X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X/* the struct for keywords etc. */ Xstruct idf { X char *id_tag; /* an interesting identifier */ X char id_tr; /* with its one-character translation */ X}; X X#define idf2char(s,l) findidf(s, l, sizeof l / sizeof l[0]) + END-OF-FILE idf.h chmod 'u=rw,g=r,o=r' \i\d\f\.\h set `sum \i\d\f\.\h` sum=$1 case $sum in 29562) :;; *) echo 'Bad sum in '\i\d\f\.\h >&2 esac echo Extracting \b\u\f\f\.\h sed 's/^X//' > \b\u\f\f\.\h << '+ END-OF-FILE '\b\u\f\f\.\h X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X Xextern char *buff; Xextern unsigned int text_length(); + END-OF-FILE buff.h chmod 'u=rw,g=r,o=r' \b\u\f\f\.\h set `sum \b\u\f\f\.\h` sum=$1 case $sum in 20348) :;; *) echo 'Bad sum in '\b\u\f\f\.\h >&2 esac echo Extracting \s\t\r\e\a\m\.\c sed 's/^X//' > \s\t\r\e\a\m\.\c << '+ END-OF-FILE '\s\t\r\e\a\m\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include X#include "stream.h" X X/* imports from the lex module */ Xextern int yylex(); Xextern yystart(); Xextern FILE *yyin; X Xint lex_no; /* pass 1: return stream of C condensed chars */ X /* pass 2: return C-char/ASCII-char position X pairs at each \n X */ X Xint lex_ch; /* condensed C-char produced by pass 1 */ Xunsigned int lex_ch_cnt;/* C-char position reported at each \n by pass 2 */ Xlong lex_ls_pos; /* lseek position reported at each \n by pass 2 */ X Xint XOpenStream(pass, fname) X char *fname; X{ X lex_no = pass; X lex_ch_cnt = 0; X lex_ls_pos = 0L; X X /* start the lex machine */ X yyin = fopen(fname, "r"); X yystart(); X return yyin != NULL; X} X Xint XNextChar(cp) /* lex_no must be 1 */ X char *cp; X{ X if (!yylex()) X return -1; X *cp = lex_ch; X return 0; X} X Xint XNextPair(ccp, lsp) /* lex_no must be 2 */ X unsigned int *ccp; X long *lsp; X{ X if (!yylex()) X return -1; X *ccp = lex_ch_cnt; X *lsp = lex_ls_pos; X return 0; X} X XCloseStream() { X fclose(yyin); X} + END-OF-FILE stream.c chmod 'u=rw,g=r,o=r' \s\t\r\e\a\m\.\c set `sum \s\t\r\e\a\m\.\c` sum=$1 case $sum in 24424) :;; *) echo 'Bad sum in '\s\t\r\e\a\m\.\c >&2 esac echo Extracting \i\d\f\.\c sed 's/^X//' > \i\d\f\.\c << '+ END-OF-FILE '\i\d\f\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "idf.h" X Xint Xfindidf(str, list, size) X char *str; X struct idf list[]; X{ X int first = 0; X int last = size - 1; X X while (first < last) { X int middle = (first + last) / 2; X X if (strcmp(str, list[middle].id_tag) > 0) X first = middle + 1; X else last = middle; X } X return strcmp(str, list[first].id_tag) == 0 ? list[first].id_tr : -1; X} + END-OF-FILE idf.c chmod 'u=rw,g=r,o=r' \i\d\f\.\c set `sum \i\d\f\.\c` sum=$1 case $sum in 60560) :;; *) echo 'Bad sum in '\i\d\f\.\c >&2 esac echo Extracting \b\u\f\f\.\c sed 's/^X//' > \b\u\f\f\.\c << '+ END-OF-FILE '\b\u\f\f\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "buff.h" X X#define BFSIZE 10000 X Xextern char *malloc(), *realloc(), *calloc(); X Xchar *buff; /* to be filled by malloc */ Xstatic unsigned int buff_size; /* size of buffer at this moment */ Xstatic unsigned int bfree; /* next free position in array buff[] */ X Xinit_buff() { X /* Allocate the text buffer */ X buff = malloc(buff_size = BFSIZE); X if (!buff) X error("out of space"); X bfree = 1; /* don't use position 0 */ X} X Xstore(ch) { X if (bfree == buff_size) { X buff = realloc(buff, buff_size += BFSIZE); X if (!buff || buff_size < bfree) { X /* overflow */ X error("out of space"); X } X } X buff[bfree++] = ch; X} X Xunsigned int Xtext_length() { X return bfree; X} + END-OF-FILE buff.c chmod 'u=rw,g=r,o=r' \b\u\f\f\.\c set `sum \b\u\f\f\.\c` sum=$1 case $sum in 14186) :;; *) echo 'Bad sum in '\b\u\f\f\.\c >&2 esac echo Extracting \e\r\r\o\r\.\c sed 's/^X//' > \e\r\r\o\r\.\c << '+ END-OF-FILE '\e\r\r\o\r\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include X Xerror(msg) X char *msg; X{ X fprintf(stderr, "sim: %s\n", msg); X exit(1); X} + END-OF-FILE error.c chmod 'u=rw,g=r,o=r' \e\r\r\o\r\.\c set `sum \e\r\r\o\r\.\c` sum=$1 case $sum in 10416) :;; *) echo 'Bad sum in '\e\r\r\o\r\.\c >&2 esac echo Extracting \t\o\p\.\p sed 's/^X//' > \t\o\p\.\p << '+ END-OF-FILE '\t\o\p\.\p X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X/* These are the parameters with which to instantiate top.g X*/ X#define TTSIZE 100 /* the TTSIZE best objects */ X#define TTTYPE struct run /* the type of the object */ X#define TTBETTER longer /* how to compare objects */ + END-OF-FILE top.p chmod 'u=rw,g=r,o=r' \t\o\p\.\p set `sum \t\o\p\.\p` sum=$1 case $sum in 20036) :;; *) echo 'Bad sum in '\t\o\p\.\p >&2 esac echo Extracting \t\o\p\.\g sed 's/^X//' > \t\o\p\.\g << '+ END-OF-FILE '\t\o\p\.\g X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X/* This is a generic package for keeping a top-10 list of objects of X/* formal type. X/* X/* Specification in Ada-style: X/* generic -- 3 formals,supplied by #define X/* TTSIZE: int; -- the size of the top-N list X/* type TTTYPE is private; -- the type of the objects X/* with function int TTBETTER(ip, jp) TTTYPE *ip, *jp; X/* -- 1 if object *ip better than X/* -- object *jp, 0 otherwise X/* package TOP is -- reflected in top.h X/* function InitTop(); -- clears the list X/* function InsertTop(obj) TTTYPE *obj; X/* -- accepts a (pointer to) an object X/* -- a generator yields the objects in best-to-worst order: X/* type TopGen is private; -- to declare the generator X/* function OpenTop(tg) TopGen *tg;-- starts the generator X/* function TTTYPE *NextTop(tg) TopGen *tg; X/* -- yields next object, moves generator X/* NoObject: constant (TTTYPE*); -- yielded at end-of-list X/* function CloseTop(tg) TopGen *tg;-- stops the generator X/* end TOP -- */ X/* The application of this file must be preceded by X/* #include "top.h", which defines the interface, and by X/* #include "top.p", which defines the parameters of the instantiation. X X/* package body TOP is -- */ Xstatic TTTYPE val[TTSIZE]; Xstatic TTTYPE *list[TTSIZE]; Xstatic int cnt; X XInitTop() { X cnt = 0; X} X XInsertTop(obj) register TTTYPE *obj; { X register int i; X X if (cnt < TTSIZE) { /* there is still room */ X list[cnt] = &val[cnt]; X val[cnt++] = *obj; X } X else X if (TTBETTER(obj, list[TTSIZE-1])) { X /* preferable to worst in set */ X *list[TTSIZE-1] = *obj; X } X else return; /* we're not interested */ X X for (i = cnt-2; i >= 0 && TTBETTER(list[i+1], list[i]); i--) { X register TTTYPE *jp = list[i+1]; X list[i+1] = list[i]; X list[i] = jp; X } X} X XOpenTop(tg) register TopGen *tg; { X *tg = 0; X} X XTTTYPE * XNextTop(tg) register TopGen *tg; { X return *tg >= cnt ? NoObject : list[(*tg)++]; X} X XCloseTop(tg) register TopGen *tg; { X *tg = TTSIZE; X} X/* end TOP -- */ + END-OF-FILE top.g chmod 'u=rw,g=r,o=r' \t\o\p\.\g set `sum \t\o\p\.\g` sum=$1 case $sum in 62947) :;; *) echo 'Bad sum in '\t\o\p\.\g >&2 esac echo Extracting \t\o\p\.\h sed 's/^X//' > \t\o\p\.\h << '+ END-OF-FILE '\t\o\p\.\h X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X/* This is the public interface of top.g, a generic package for keeping X/* a top-10 list of objects of formal type. X/* X/* The application of this file must be preceded by X/* #include "top.p", which defines the parameters of the instantiation. X*/ X Xextern InitTop(); Xextern InsertTop(); Xtypedef int TopGen; Xextern OpenTop(); Xextern TTTYPE *NextTop(); X#define NoObject ((TTTYPE*)0) Xextern CloseTop(); + END-OF-FILE top.h chmod 'u=rw,g=r,o=r' \t\o\p\.\h set `sum \t\o\p\.\h` sum=$1 case $sum in 48552) :;; *) echo 'Bad sum in '\t\o\p\.\h >&2 esac echo Extracting \t\o\p\.\c sed 's/^X//' > \t\o\p\.\c << '+ END-OF-FILE '\t\o\p\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "text.h" X#include "top.p" X#include "top.h" X X#ifndef lint /* lint won't take this define ?!?!?! */ X#define longer(r0,r1) (r0->rn_quality > r1->rn_quality) X X#else Xstatic int Xlonger(r0, r1) struct run *r0, *r1; { X return r0->rn_quality > r1->rn_quality; X} X#endif lint X X/* Instantiate top.g */ X#include "top.g" + END-OF-FILE top.c chmod 'u=rw,g=r,o=r' \t\o\p\.\c set `sum \t\o\p\.\c` sum=$1 case $sum in 10035) :;; *) echo 'Bad sum in '\t\o\p\.\c >&2 esac echo Extracting \p\a\r\a\m\s\.\h sed 's/^X//' > \p\a\r\a\m\s\.\h << '+ END-OF-FILE '\p\a\r\a\m\s\.\h X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#define MIN_RUN 24 /* default minimum size X of interesting run X */ X#define PAGE_WIDTH 80 + END-OF-FILE params.h chmod 'u=rw,g=r,o=r' \p\a\r\a\m\s\.\h set `sum \p\a\r\a\m\s\.\h` sum=$1 case $sum in 36700) :;; *) echo 'Bad sum in '\p\a\r\a\m\s\.\h >&2 esac echo Extracting \t\e\x\t\.\h sed 's/^X//' > \t\e\x\t\.\h << '+ END-OF-FILE '\t\e\x\t\.\h X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X Xstruct text { X char *tx_fname; /* the file name */ X int tx_needed; /* set if file plays a role in final output */ X unsigned int tx_start; /* positions in buff for the text */ X unsigned int tx_limit; X}; X Xstruct chunk { X /* a chunk of text in various representations */ X struct text *ch_text; /* a pointer to the file text */ X unsigned int ch_st_ch; /* first in chunk, counted in C-chars */ X unsigned int ch_lm_ch; /* first not in chunk */ X long ch_st_ls; /* same in lseek positions */ X long ch_lm_ls; X unsigned int ch_st_nl; /* same in line numbers */ X unsigned int ch_lm_nl; X}; X Xstruct run { /* a 'run' of coincident chars */ X struct chunk rn_cn0; /* chunk in left file */ X struct chunk rn_cn1; /* chunk in right file */ X unsigned int rn_quality; X}; + END-OF-FILE text.h chmod 'u=rw,g=r,o=r' \t\e\x\t\.\h set `sum \t\e\x\t\.\h` sum=$1 case $sum in 43449) :;; *) echo 'Bad sum in '\t\e\x\t\.\h >&2 esac echo Extracting \d\e\b\u\g\.\h sed 's/^X//' > \d\e\b\u\g\.\h << '+ END-OF-FILE '\d\e\b\u\g\.\h X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#define DEBUG 0 + END-OF-FILE debug.h chmod 'u=rw,g=r,o=r' \d\e\b\u\g\.\h set `sum \d\e\b\u\g\.\h` sum=$1 case $sum in 64324) :;; *) echo 'Bad sum in '\d\e\b\u\g\.\h >&2 esac echo Extracting \s\i\m\.\c sed 's/^X//' > \s\i\m\.\c << '+ END-OF-FILE '\s\i\m\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "params.h" X Xint min_run_size = MIN_RUN; X Xchar options[128]; /* for various, extensible flags */ X Xmain(argc, argv) X char *argv[]; X{ X argv++, argc--; /* skip program name */ X X while (argc > 0 && argv[0][0] == '-') { X char *par = &argv[0][1]; X X while (*par) { X switch (*par) { X case 'r': X min_run_size = atoi(argv[1]); X argc--, argv++; X break; X default: X options[*par]++; X break; X } X par++; X } X argc--, argv++; X } X if (min_run_size == 0) X error("Minimum run size equals 0"); X X init_buff(); X X /* Read the files */ X pass1(argv, argc); X X /* Set up the hash table */ X make_hash(); X X /* Compare various files */ X compare(); X X /* Delete hash table */ X free_hash(); X X /* Find positions of found similarities */ X pass2(); X X /* Print the similarities */ X pass3(); X return 0; X} + END-OF-FILE sim.c chmod 'u=rw,g=r,o=r' \s\i\m\.\c set `sum \s\i\m\.\c` sum=$1 case $sum in 42365) :;; *) echo 'Bad sum in '\s\i\m\.\c >&2 esac echo Extracting \p\a\s\s\1\.\c sed 's/^X//' > \p\a\s\s\1\.\c << '+ END-OF-FILE '\p\a\s\s\1\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "buff.h" X#include "text.h" X Xextern char *calloc(); X Xstruct text *text; /* to be filled in by calloc */ Xint ntexts; /* number of text records */ X Xpass1(argv, argc) X char *argv[]; X{ X int n; X X /* allocate the array of text descriptors */ X ntexts = argc; X text = (struct text *)calloc((unsigned)ntexts, sizeof (struct text)); X if (!text) X error("Too many files"); X X /* read the files */ X for (n = 0; n < ntexts; n++) { X char *fname = argv[n]; X struct text *txt = &text[n]; X char ch; X X printf("File %s: ", fname); X X txt->tx_fname = fname; X txt->tx_start = txt->tx_limit = text_length(); X if (!OpenStream(1, txt->tx_fname)) { X printf("cannot open\n"); X OpenStream(1, "/dev/null"); X } X while (NextChar(&ch) == 0) X store(ch); X CloseStream(); X X txt->tx_limit = text_length(); X printf("%u C-units\n", txt->tx_limit - txt->tx_start); X } X printf("Total: %u C-units\n", text_length() - 1); X printf("\n"); X} + END-OF-FILE pass1.c chmod 'u=rw,g=r,o=r' \p\a\s\s\1\.\c set `sum \p\a\s\s\1\.\c` sum=$1 case $sum in 58561) :;; *) echo 'Bad sum in '\p\a\s\s\1\.\c >&2 esac echo Extracting \h\a\s\h\.\c sed 's/^X//' > \h\a\s\h\.\c << '+ END-OF-FILE '\h\a\s\h\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "buff.h" X#include "text.h" X Xextern char *calloc(); X Xextern char options[]; Xextern int ntexts; Xextern struct text *text; Xextern int min_run_size; X Xstatic int hash_code(); Xstatic print_hash(); X X#define N_HASH 10639 /* any suitable prime */ X Xunsigned int *hash_table; /* to be filled by malloc() */ X X/* to judge the quality of the hash code */ Xstatic tally_right = 0, tally_wrong = 0; Xstatic tally_hash(), print_tally(); X Xmake_hash() { X unsigned int last[N_HASH]; X /* last[i] is the index of the latest char with hash_code i, X or 0 if there is none. X */ X int n; X X for (n = 0; n < N_HASH; n++) X last[n] = 0; X X hash_table = (unsigned int *) X calloc(text_length(), sizeof (unsigned int)); X if (options['x']) X hash_table = 0; X if (!hash_table) { X printf(">>> Not enough memory for the hash table, "); X printf("this is going to take time!\n\n"); X return; X } X X for (n = 0; n < ntexts; n++) { X struct text *txt = &text[n]; X unsigned int j; X X for ( X j = txt->tx_start; X j < txt->tx_limit - min_run_size + 1; X j++ X ) { X int h = hash_code(&buff[j]); X X if (last[h]) { X hash_table[last[h]] = j; X if (options['h']) X tally_hash(last[h], j); X } X last[h] = j; X } X } X if (options['h']) X print_tally(); X if (options['H']) X print_hash(); X} X Xstatic int Xhash_code(p) X char *p; X{ X /* hash_code(p) returns the hash code of the min_run_size first X characters starting at p; caller guarantees that there X are at least min_run_size chars. X */ X int h = 0; X int i; X X for (i = 0; i < min_run_size; i++) X h = ((h << 1) + *p++) % N_HASH; X return h; X} X Xstatic Xprint_hash() X{ X /* will not be called if hash_table == 0 */ X unsigned int i; X X for (i = 1; i < text_length(); i++) { X printf("%d: %c: ", i, buff[i]); X printf("%u\n", hash_table[i]); X } X} X Xstatic Xtally_hash(i0, i1) X unsigned int i0, i1; X{ X int i; X X for (i = 0; i < min_run_size; i++) { X if (buff[i0++] != buff[i1++]) { X tally_wrong++; X return; X } X } X tally_right++; X} X Xstatic Xprint_tally() X{ X printf("Tally_right = %d, tally_wrong = %d, ", X tally_right, tally_wrong); X printf("hash code efficiency = %d%%\n", X 100 * tally_right / (tally_right + tally_wrong)); X} X Xfree_hash() { X if (hash_table) X free((char *)hash_table); X} + END-OF-FILE hash.c chmod 'u=rw,g=r,o=r' \h\a\s\h\.\c set `sum \h\a\s\h\.\c` sum=$1 case $sum in 22444) :;; *) echo 'Bad sum in '\h\a\s\h\.\c >&2 esac echo Extracting \c\o\m\p\a\r\e\.\c sed 's/^X//' > \c\o\m\p\a\r\e\.\c << '+ END-OF-FILE '\c\o\m\p\a\r\e\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "buff.h" X#include "text.h" X#include "top.p" X#include "top.h" X X/* from the Language Department: */ Xextern int MayBeStartOfRun(); Xextern unsigned int CheckRun(); X Xextern char options[]; Xextern int ntexts; Xextern struct text *text; Xextern int min_run_size; Xextern unsigned int *hash_table; Xextern add_run(); X Xstatic struct text *txt_at(); Xstatic unsigned int lcs(); X Xcompare() X{ X int n; X X InitTop(); X for (n = 0; n < ntexts; n++) { X struct text *txt0 = &text[n]; X unsigned int i0 = txt0->tx_start; X X while (i0 < txt0->tx_limit - min_run_size + 1) { X i0 += lcs(txt0, i0); X } X } X} X Xstatic unsigned int Xlcs(txt0, i0) X struct text *txt0; X unsigned int i0; X{ X /* find the longest common substring in: X txt0, starting precisely at i0 and X the rest of the text X */ X struct text *txt1 = txt0; X unsigned int i1 = i0; X struct text *txt_best; X unsigned int i_best; X unsigned int size_best = 0; X X if (!MayBeStartOfRun(buff[i0])) { X return 1; X } X X while( X i1 = hash_table ? hash_table[i1] : i1 + 1, X txt1 = txt_at(txt1, i1) X ) { X X if ( /* we don't want to compare a file to itself */ X options['s'] && i1 < txt0->tx_limit X ) { X /* skip this possibility */ X } X else X if ( /* we are looking at the middle of a run */ X i0 != txt0->tx_start && i1 != txt1->tx_start && X buff[i0-1] == buff[i1-1] X ) { X /* skip this possibility */ X } X else { X /* see how far we can get */ X unsigned int j0 = i0, j1 = i1; X unsigned int size = 0; X unsigned int limit0 = txt0->tx_limit; X unsigned int limit1 = txt1->tx_limit; X X while ( size < j1 - j0 && X j0 < limit0 && j1 < limit1 && X buff[j0] == buff[j1] X ) { X j0++, j1++, size++; X } X X if (size >= min_run_size) { X /* offer the run to the X Language Department X */ X size = CheckRun(&buff[i0], size); X } X X if ( /* we still have something better */ X size >= min_run_size && size > size_best X ) { X /* record it */ X txt_best = txt1; X i_best = i1; X size_best = size; X } X } X } X if (size_best) { X add_run(txt0, i0, txt_best, i_best, size_best); X return size_best; X } X else X return 1; X} X Xstatic struct text * Xtxt_at(txt, i) X struct text *txt; X unsigned int i; X{ X if (i == 0 || i >= text_length()) X return 0; X while (i >= txt->tx_limit) X txt++; X return txt; X} + END-OF-FILE compare.c chmod 'u=rw,g=r,o=r' \c\o\m\p\a\r\e\.\c set `sum \c\o\m\p\a\r\e\.\c` sum=$1 case $sum in 22558) :;; *) echo 'Bad sum in '\c\o\m\p\a\r\e\.\c >&2 esac echo Extracting \a\d\d\_\r\u\n\.\c sed 's/^X//' > \a\d\d\_\r\u\n\.\c << '+ END-OF-FILE '\a\d\d\_\r\u\n\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "buff.h" X#include "text.h" X#include "top.p" X#include "top.h" X Xstatic set_chunk(); X Xadd_run(txt0, i0, txt1, i1, size) X struct text *txt0, *txt1; X unsigned int i0, i1; X unsigned int size; X{ X /* Adds the run of given size to our collection. X */ X struct run r; X X set_chunk(&r.rn_cn0, txt0, i0 - txt0->tx_start, size); X set_chunk(&r.rn_cn1, txt1, i1 - txt1->tx_start, size); X r.rn_quality = size; X X InsertTop(&r); X} X Xstatic Xset_chunk(cnk, txt, index, size) X struct chunk *cnk; X struct text *txt; X unsigned int index; X unsigned int size; X{ X /* Fill the chunk *cnk with info about the piece of text X in txt starting at index extending over size characters. X */ X txt->tx_needed = 1; X cnk->ch_text = txt; X cnk->ch_st_ch = index; X cnk->ch_lm_ch = index + size; X cnk->ch_st_ls = cnk->ch_lm_ls = 0; X cnk->ch_st_nl = cnk->ch_lm_nl = 1; X} + END-OF-FILE add_run.c chmod 'u=rw,g=r,o=r' \a\d\d\_\r\u\n\.\c set `sum \a\d\d\_\r\u\n\.\c` sum=$1 case $sum in 53616) :;; *) echo 'Bad sum in '\a\d\d\_\r\u\n\.\c >&2 esac echo Extracting \p\a\s\s\2\.\c sed 's/^X//' > \p\a\s\s\2\.\c << '+ END-OF-FILE '\p\a\s\s\2\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include "text.h" X#include "top.p" X#include "top.h" X#include "debug.h" X Xextern int ntexts; Xextern struct text *text; X Xstatic upd_top(), upd_chunk(); X Xpass2() { X int n; X X for (n = 0; n < ntexts; n++) { X struct text *txt = &text[n]; X unsigned int ch_cnt; X long ls_pos; X unsigned int nl_cnt = 1; X X if (!txt->tx_needed) /* an optimization */ X continue; X X if (!OpenStream(2, txt->tx_fname)) { X printf("*** File %s disappeared\n", txt->tx_fname); X OpenStream(2, "/dev/null"); X } X X while (NextPair(&ch_cnt, &ls_pos) == 0) { X /* fill in the lseek and line positions X in the collected runs X */ X nl_cnt++; X#if DEBUG == 1 X printf("pass2 on %s: ch_cnt = %u, ls_pos = %ld\n", X txt->tx_fname, ch_cnt, ls_pos); X#endif DEBUG == 1 X upd_top(txt, ch_cnt, ls_pos, nl_cnt); X } X CloseStream(); X } X} X Xstatic Xupd_top(txt, ch_cnt, ls_pos, nl_cnt) X struct text *txt; X unsigned int ch_cnt; X long ls_pos; X unsigned int nl_cnt; X{ X TopGen tp; X struct run *run; X X OpenTop(&tp); X while ((run = NextTop(&tp)), run != NoObject) { X struct chunk *cnk0 = &run->rn_cn0; X struct chunk *cnk1 = &run->rn_cn1; X X if (cnk0->ch_text == txt) X upd_chunk(cnk0, ch_cnt, ls_pos, nl_cnt); X if (cnk1->ch_text == txt) X upd_chunk(cnk1, ch_cnt, ls_pos, nl_cnt); X } X CloseTop(&tp); X} X Xstatic Xupd_chunk(cnk, ch_cnt, ls_pos, nl_cnt) X struct chunk *cnk; X unsigned int ch_cnt; X long ls_pos; X unsigned int nl_cnt; X{ X if (ch_cnt <= cnk->ch_st_ch) { X cnk->ch_st_ls = ls_pos; X cnk->ch_st_nl = nl_cnt; X } X if (cnk->ch_lm_ls == 0 && cnk->ch_lm_ch <= ch_cnt) { X cnk->ch_lm_ls = ls_pos; X cnk->ch_lm_nl = nl_cnt; X } X} + END-OF-FILE pass2.c chmod 'u=rw,g=r,o=r' \p\a\s\s\2\.\c set `sum \p\a\s\s\2\.\c` sum=$1 case $sum in 15182) :;; *) echo 'Bad sum in '\p\a\s\s\2\.\c >&2 esac echo Extracting \p\a\s\s\3\.\c sed 's/^X//' > \p\a\s\s\3\.\c << '+ END-OF-FILE '\p\a\s\s\3\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X#include X#include "params.h" X#include "text.h" X#include "top.p" X#include "top.h" X#include "debug.h" X Xextern char options[]; X Xstatic FILE *chunk_open(); Xstatic unsigned int fill_line(); Xstatic show_chunk(), show_line(), clear_line(), print_run(); X X#define MAXLINE (PAGE_WIDTH/2-2) X Xpass3() { X TopGen tp; X struct run *run; X X OpenTop(&tp); X while ((run = NextTop(&tp)), run != NoObject) { X print_run(run); X show_chunk(run); X printf("\n"); X } X CloseTop(&tp); X} X Xstatic Xprint_run(run) X struct run *run; X{ X#if DEBUG == 1 X#include "buff.h" X unsigned int i; X struct chunk *cnk0 = &run->rn_cn0; X struct chunk *cnk1 = &run->rn_cn1; X X printf("File %s vs. file %s:\n", X cnk0->ch_text->tx_fname, X cnk1->ch_text->tx_fname X ); X printf("from C-char %d,%d to %d,%d:", X cnk0->ch_st_ch, cnk1->ch_st_ch, X cnk0->ch_lm_ch, cnk1->ch_lm_ch X ); X printf(" from ASCII-char %d,%d to %d,%d:", X cnk0->ch_st_ls, cnk1->ch_st_ls, X cnk0->ch_lm_ls, cnk1->ch_lm_ls X ); X printf(" from lines %d,%d to %d,%d:", X cnk0->ch_st_nl, cnk1->ch_st_nl, X cnk0->ch_lm_nl - 1, cnk1->ch_lm_nl - 1 X ); X printf(" %d C-chars\n", run->rn_quality); X X /* show C-chars, with a one-char margin */ X for ( i = cnk0->ch_st_ch - 1; X i < cnk0->ch_lm_ch + 1; X i++ X ) { X putchar(buff[cnk0->ch_text->tx_start + i]); X } X printf("\n"); X for ( i = cnk1->ch_st_ch - 1; X i < cnk1->ch_lm_ch + 1; X i++ X ) { X putchar(buff[cnk1->ch_text->tx_start + i]); X } X printf("\n"); X#endif DEBUG == 1 X X#ifdef lint X run = run; X#endif lint X} X X Xstatic Xshow_chunk(run) X struct run *run; X{ X /* The animals came in two by two ... */ X struct chunk *cnk0 = &run->rn_cn0; X struct chunk *cnk1 = &run->rn_cn1; X unsigned int nl_cnt0 = cnk0->ch_lm_nl - cnk0->ch_st_nl; X unsigned int nl_cnt1 = cnk1->ch_lm_nl - cnk1->ch_st_nl; X FILE *f0; X FILE *f1; X char line0[MAXLINE + 1]; X char line1[MAXLINE + 1]; X extern char *sprintf(); X X sprintf(line0, "%s: line %d-%d", X cnk0->ch_text->tx_fname, X cnk0->ch_st_nl, cnk0->ch_lm_nl - 1, run->rn_quality); X sprintf(line1, "%s: line %d-%d", X cnk1->ch_text->tx_fname, X cnk1->ch_st_nl, cnk1->ch_lm_nl - 1, run->rn_quality); X show_line(line0, line1); X if (options['n']) X return; /* ... had enough so soon ... */ X X f0 = chunk_open(cnk0); X f1 = chunk_open(cnk1); X X /* fill lines and print them */ X while (nl_cnt0 != 0 || nl_cnt1 != 0) { X if (nl_cnt0) { X fill_line(f0, line0); X nl_cnt0--; X } X else clear_line(line0); X if (nl_cnt1) { X fill_line(f1, line1); X nl_cnt1--; X } X else clear_line(line1); X show_line(line0, line1); X } X X fclose(f0); X fclose(f1); X} X Xstatic FILE * Xchunk_open(cnk) X struct chunk *cnk; X{ X /* opens the file in which the chunk resides and positions X the file at the beginning of the chunk X */ X char *fname = cnk->ch_text->tx_fname; X FILE *f = fopen(fname, "r"); X X if (f == NULL) { X printf("*** File %s disappeared\n", fname); X f = fopen("/dev/null", "r"); X } X fseek(f, cnk->ch_st_ls, 0); X return f; X} X Xstatic unsigned int Xfill_line(f, ln) X FILE *f; X char ln[]; X{ X /* Reads one line from f and puts it in condensed form in ln. X */ X int ch; X int indent = 0, lpos = 0; X X /* condense and skip initial blank */ X while ((ch = getc(f)), ch == ' ' || ch == '\t') { X if (ch == '\t') X indent = 8; X else X indent++; X if (indent == 8) { X /* every eight blanks give one blank */ X if (lpos < MAXLINE) X ln[lpos++] = ' '; X indent = 0; X } X } X X /* store the rest */ X while (ch >= 0 && ch != '\n') { X if (ch == '\t') /* replace tabs by blanks */ X ch = ' '; X if (lpos < MAXLINE) X ln[lpos++] = ch; X ch = getc(f); X } X ln[lpos] = '\0'; /* always room for this one */ X} X Xstatic Xclear_line(ln) X char ln[]; X{ X /* a simple null byte will suffice */ X ln[0] = '\0'; X} X Xstatic Xshow_line(ln0, ln1) X char ln0[], ln1[]; X{ X int i; X X for (i = 0; i < MAXLINE && ln0[i] != '\0'; i++) X putchar(ln0[i]); X for (; i < MAXLINE; i++) X putchar(' '); X printf(" |"); X X for (i = 0; i < MAXLINE && ln1[i] != '\0'; i++) X putchar(ln1[i]); X printf("\n"); X} + END-OF-FILE pass3.c chmod 'u=rw,g=r,o=r' \p\a\s\s\3\.\c set `sum \p\a\s\s\3\.\c` sum=$1 case $sum in 35264) :;; *) echo 'Bad sum in '\p\a\s\s\3\.\c >&2 esac echo Extracting \m\a\i\n\.\c sed 's/^X//' > \m\a\i\n\.\c << '+ END-OF-FILE '\m\a\i\n\.\c X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X/* X This is a service program for the similarity tester. X A call of 'cstream -1 inp.c' yields the tokens of inp.c as X single characters, as used by pass1 of sim. X A call of 'cstream -2 inp.c' yields a list of pairs, one for each line. This is used by X pass3. X*/ X X#include X Xchar options[128]; X Xmain(argc, argv) X char *argv[]; X{ X if (argc != 3) { X fprintf(stderr, "Call is: %s -[12] inp.c\n", argv[0]); X return 1; X } X X if (!OpenStream(argv[1][1] - '0', argv[2])) { X fprintf(stderr, "%s: cannot open\n", argv[2]); X return 1; X } X X if (argv[1][1] == '1') { X char ch; X X while (NextChar(&ch) == 0) X putchar(ch); X } X else X if (argv[1][1] == '2') { X unsigned int ch_cnt; X long ls_pos; X X while (NextPair(&ch_cnt, &ls_pos) == 0) X printf("%ld,%ld\n", ch_cnt, ls_pos); X } X return 0; X} + END-OF-FILE main.c chmod 'u=rw,g=r,o=r' \m\a\i\n\.\c set `sum \m\a\i\n\.\c` sum=$1 case $sum in 07408) :;; *) echo 'Bad sum in '\m\a\i\n\.\c >&2 esac echo Extracting \c\l\a\n\g\.\l sed 's/^X//' > \c\l\a\n\g\.\l << '+ END-OF-FILE '\c\l\a\n\g\.\l X%{ X/* This file is part of the software similarity tester SIM. X Written by Dick Grune, Vrije Universiteit, Amsterdam. X*/ X X/* X C language front end for the similarity tester. X*/ X X/* Language-dependent Code */ X#include "idf.h" X Xstatic struct idf ppcmd[] = { X "define", 'D', X "else", 'E', X "endif", 'Z', X "if", 'F', X "ifdef", 'Y', X "ifndef", 'N', X "include", 'I', X "line", 'L', X "undef", 'U' X}; X Xstatic struct idf reserved[] = { X "auto", 'a', X "break", 'b', X "case", 'k', X "char", 'c', X "continue", 'z', X "default", '_', X "do", 'd', X "double", 'm', X "else", 'e', X "enum", 'n', X "extern", 'q', X "float", 'y', X "for", 'f', X "goto", 'g', X "if", 'i', X "int", 'j', X "long", 'l', X "register", '\0', /* ignore */ X "return", 'r', X "short", 'h', X "sizeof", 'o', X "static", 'p', X "struct", 's', X "switch", 'x', X "typedef", 't', X "union", 'u', X "unsigned", 'v', X "void", '\0', /* ignore */ X "while", 'w' X}; X Xstatic int Xis_trailer(ch) { X return ch == ')' || ch == '}' || ch == ';'; X} X Xint XMayBeStartOfRun(ch) { X return !is_trailer(ch); X} X Xunsigned int XCheckRun(str, size) char *str; unsigned int size; { X /* Checks the run starting at str with length size for X acceptability in the language. Cuts from the end if X necessary and returns the accepted length (which may X be zero). X */ X extern char options[]; X X if (options['f']) { /* function-like forms only */ X unsigned int pos; X unsigned int lb_pos = 0;/* latest balancing position */ X int braces = 0; X int parens = 0; X int brackets = 0; X X for (pos = 0; pos < size; pos++) { X switch (str[pos]) { X case '{': braces++; break; X case '}': braces--; break; X case '(': parens++; break; X case ')': parens--; break; X case '[': brackets++; break; X case ']': brackets--; break; X } X if ( /* this was one closer too many */ X braces < 0 || parens < 0 || brackets < 0 X ) { X break; X } X if ( /* it happens to balance here */ X braces == 0 && parens == 0 && brackets == 0 X ) { X lb_pos = pos + 1; X } X } X size = lb_pos; /* cut to size */ X } X else { X while ( /* there is trailing garbage */ X size != 0 && X (str[size - 1] == '@' || is_trailer(str[size - 1])) X ) { X /* remove it */ X size--; X } X } X return size; X} X X/* Language-INdependent Code */ X#include "stream.h" X Xyystart() { X BEGIN INITIAL; X} X Xstatic int Xyywrap() { X return 1; X} X X%} X X%Start Comment X XAnyQuoted (\\.) XStrChar ([^"\n\\]|{AnyQuoted}) XChrChar ([^'\\]|{AnyQuoted}) XComChar ([^*\n]|(\*[^/])) XIdf ([A-Za-z][A-Za-z0-9_]*) X X%% X X\"{StrChar}*\" { /* strings */ X cput('"'); X count(); X } X X\'{ChrChar}\' { /* characters */ X cput('\''); X count(); X } X X"/*" { /* We cannot have one pattern for a comment X (although one can be written), since the matched X string would overflow lex-internal buffers like X yysbuf and yytext. So we have to break up the string X into lines and keep track of where we are in a start X condition . X */ X BEGIN Comment; X count(); X } X X{ComChar}* { /* comment up to \n or end-of-comment */ X count(); X } X X"*/" { /* end-of-comment */ X BEGIN INITIAL; X count(); X } X X#[ \t]*include.* { /* skip #include line */ X count(); X } X X#[ \t]*{Idf} { /* a preprocessor line */ X char *n = yytext+1; X int ch; X X /* skip layout in front of preprocessor identifier */ X while (*n == ' ' || *n == '\t') X n++; X ch = idf2char(n, ppcmd); X if (ch < 0) { X cput('#'); X } X else X cput(ch); X count(); X } X X{Idf} { X int ch = idf2char(yytext, reserved); X X if (ch < 0) X cput('@'); X else X if (ch > 0) X cput(ch); X count(); X } X X[ \t] { /* layout */ X count(); X } X X\n { /* count newlines */ X count(); X c_eol(); X } X X. { /* copy other text */ X cput(yytext[0]); X count(); X } X X%% + END-OF-FILE clang.l chmod 'u=rw,g=r,o=r' \c\l\a\n\g\.\l set `sum \c\l\a\n\g\.\l` sum=$1 case $sum in 21016) :;; *) echo 'Bad sum in '\c\l\a\n\g\.\l >&2 esac exit 0