-h- readme.txt Fri Jun 3 10:49:44 1988 USER1:[MINOW.PERSONAL.SOURCE.LZ]README.TXT;2 This is a rewrite of the Unix compress utility. It is *not* switch-compatible with Unix compress, however it is (almost) file-compatible (when compiled on Unix, or when "export" mode is selected on VMS Version 4). The advantages of this version are as follows: 1. Compress and decompress are separate programs, simplifying the problems of the small system implementor. Both run on an unmapped PDP-11 (with a maximum of 12 bits). The command interface is just lzcomp input compressed_output lzdcmp compressed_input output Input files are not deleted. 2. The compression algorithm and I/O design is intended to simplify embedding the programs (as subroutines) in some other task. (for example, in a database package.) 3. On non-Unix systems, the I/O design should be significantly faster. It should be slightly faster on Unix. The only disadvantage is that, as noted, it is not command (option) compatible with Unix compress. Also, some peripheral functionality (such as the deletion of input files and the output file naming conventions) have not been implemented. On Unix (i.e., in "export" mode), the compressed data file is identical to the Unix file, *except* that lzcomp writes two CLEAR codes in a row to signal end-of-file (and lzdcmp treats two CLEAR codes in a row as signalling end-of-file). This means that, if you compress a file using lzcomp and decompress it using Unix compress, there will be a couple of extra bytes added at the end of the decompressed output file. lzcomp and lzdcmp have been added to the Decus C distribution. Martin Minow decvax!minow, minow%rex.dec@decwrl.arpa -h- descrip.mms Fri Jun 3 10:49:44 1988 USER1:[MINOW.PERSONAL.SOURCE.LZ]DESCRIP.MMS;85 ! linkflags = /debug lzcomp_objs = lzcmp1.obj lzcmp2.obj lzcmp3.obj lzcomp.obj lzdcmp_objs = lzdcm1.obj lzdcm2.obj lzdcm3.obj lzdcmp.obj common_objs = lzio.obj lzvio.obj lzdcl.obj lz : lzcomp.exe lzdcmp.exe lz.hlb lz.com copy nl: nl: list : lz.lst linepr lz.h *.cld lz*.c # # You cannot build the full kit without access to Decus C tools, # (getrno and scopy), which are not included in the lz distribution. # full_kit : lzcomp.exe lzdcmp.exe lz.hlb lz.com lzcomp.mem lzdcmp.mem copy nl: nl: # # Build and test lz # triptest : lzcomp.exe lzdcmp.exe triptest.com @ triptest # # Note: the link continuation lines are preceeded by two 's # which are recognized by fixmms.com. lz.com is in the dependency # list so the programs are re-linked when descrip.mms changes. # lzcomp.exe : $(lzcomp_objs) $(common_objs) lz.com link/nomap/exe=lzcomp $(linkflags) - lzcmp1.obj,lzcmp2.obj,lzcmp3.obj,lzcomp.obj, - lzio.obj,lzvio.obj,lzdcl.obj, - sys$library:vaxcrtl/lib lzdcmp.exe : $(lzdcmp_objs) $(common_objs) lz.com link/nomap/exe=lzdcmp $(linkflags) - lzdcm1.obj,lzdcm2.obj,lzdcm3.obj,lzdcmp.obj, - lzio.obj,lzvio.obj,lzdcl.obj, - sys$library:vaxcrtl/lib lz.com : descrip.mms fixmms.com mms/noaction/from_sources/output=lz.tmp fixmms :== @'f$logical("SYS$DISK")''f$directory()'fixmms.com fixmms lz.tmp lz.com delete lz.tmp;* # # Documentation is created using Decus C tools. These are stored # in a directory defined by the BIN: logical: # getrno reads .C source, producing .RNO # scopy fixes file attributes. # lzcomp.mem : lzcomp.rno scopy :== $bin:scopy runoff lzcomp scopy lzcomp.mem lzcomp.rno : lzcmp1.c getrno :== $bin:getrno getrno lzcmp1.c -o lzcomp.rno lzdcmp.mem : lzdcmp.rno scopy :== $bin:scopy runoff lzdcmp scopy lzdcmp.mem lzdcmp.rno : lzdcm1.c getrno :== $bin:getrno getrno lzdcm1.c -o lzdcmp.rno # # The archive creator uses the Decus C text archive programs. # I store these in a tools: directory. If you change this, # be sure to edit makefile.txt (Unix make command file) # archive : lzarch.arc lz1.arc lz2.arc lz3.arc lz4.arc copy nl: nl: lzarch_arc = 1streadme.txt tools:archx.c tools:archc.c lz1a_arc = readme.txt descrip.mms makefile.txt lz.com fixmms.com lz1b_arc = lzcomp.mem lzdcmp.mem lz2_arc = lzcmp1.c lzcmp2.c lzcmp3.c lz3_arc = lzdcm1.c lzdcm2.c lzdcm3.c lz.hlp lz4_arc = lz.h lzcomp.cld lzdcmp.cld lzdcl.c lzio.c lzvio.c lzarch.arc : $(lzarch_arc) archc :== $bin:archc archc $(lzarch_arc) >lzarch.arc lz1.arc : $(lz1a_arc) $(lz1b_arc) archc :== $bin:archc archc $(lz1a_arc) >lz1.arc archc $(lz1b_arc) >>lz1.arc lz2.arc : $(lz2_arc) archc :== $bin:archc archc $(lz2_arc) >lz2.arc lz3.arc : $(lz3_arc) archc :== $bin:archc archc $(lz3_arc) >lz3.arc lz4.arc : $(lz4_arc) archc :== $bin:archc archc $(lz4_arc) >lz4.arc # # Object module dependencies # lzcmp1.obj : lz.h lzcmp1.c lzcmp2.obj : lz.h lzcmp2.c lzcmp3.obj : lz.h lzcmp3.c lzdcm1.obj : lz.h lzdcm1.c lzdcm2.obj : lz.h lzdcm2.c lzdcm3.obj : lz.h lzdcm3.c lzio.obj : lz.h lzio.c lzvio.obj : lz.h lzvio.c lzdcl.obj : lz.h lzdcl.c lzcomp.obj : lzcomp.cld lzdcmp.obj : lzdcmp.cld lz.hlb : lz.hlp -h- makefile.txt Fri Jun 3 10:49:44 1988 USER1:[MINOW.PERSONAL.SOURCE.LZ]MAKEFILE.TXT;8 # Unix makefile for lzcomp, lzdcmp # # The redefinition of strchr() and strrchr() are needed for # Ultrix-32, Unix 4.2 bsd (and maybe some other Unices). # BSDDEFINE = -Dstrchr=index -Dstrrchr=rindex # # On certain systems, such as Unix System III, you may need to define # $(LINTFLAGS) in the make command line to set system-specific lint flags. # CFLAGS = -O $(BSDDEFINES) all : lzcomp lzdcmp # # ** compile lzcomp # LZCOMP_SRCS = lzcmp1.c lzcmp2.c lzcmp3.c lzio.c LZCOMP_OBJS = lzcmp1.o lzcmp2.o lzcmp3.o lzio.o lzcomp: $(LZCOMP_OBJS) $(CC) $(CFLAGS) $(LZCOMP_OBJS) -o lzcomp # # ** compile lzdcmp # LZDCMP_SRCS = lzdcm1.c lzdcm2.c lzdcm3.c lzio.c LZDCMP_OBJS = lzdcm1.o lzdcm2.o lzdcm3.o lzio.o lzdcmp: $(LZDCMP_OBJS) $(CC) $(CFLAGS) $(LZDCMP_OBJS) -o lzdcmp # # ** Lint the code # lint: $(LZCOMP_SRCS) $(LZDCMP_SRCS) lint $(LINTFLAGS) $(DEFINES) $(LZCOMP_SRCS) lint $(LINTFLAGS) $(DEFINES) $(LZDCMP_SRCS) # # ** Remove unneeded files # clean: rm -f $(OBJS) lzcomp lzdcmp # # ** Rebuild the archive files # ** Uses the Decus C archive utility. # archive: lzarch_arc = 1streadme.txt archx.c archc.c lz1a_arc = readme.txt descrip.mms makefile.txt lz.com fixmms.com lz1b_arc = lzcomp.mem lzdcmp.mem lz2_arc = lzcmp1.c lzcmp2.c lzcmp3.c lz3_arc = lzdcm1.c lzdcm2.c lzdcm3.c lz.hlp lz4_arc = lz.h lzcomp.cld lzdcmp.cld lzdcl.c lzio.c lzvio.c makefile.txt : Makefile cp Makefile makefile.txt lzarch.arc : $(lzarch_arc) archc archc $(lzarch_arc) >lzarch.arc lz1.arc : $(lz1a_arc) $(lz1b_arc) archc archc $(lz1a_arc) >lz1.arc archc $(lz1b_arc) >>lz1.arc lz2.arc : $(lz2_arc) archc archc $(lz2_arc) >lz2.arc lz3.arc : $(lz3_arc) archc archc $(lz3_arc) >lz3.arc lz4.arc : $(lz4_arc) archc archc $(lz4_arc) >lz4.arc archc : archc.c cc archc.c mv a.out archc # # Object module dependencies # lzcmp1.o : lzcmp1.c lz.h lzcmp2.o : lzcmp2.c lz.h lzcmp3.o : lzcmp3.c lz.h lzio.o : lzio.c lz.h lzdcm1.o : lzdcm1.c lz.h lzdcm2.o : lzdcm2.c lz.h lzdcm3.o : lzdcm3.c lz.h -h- lz.com Fri Jun 3 10:49:44 1988 USER1:[MINOW.PERSONAL.SOURCE.LZ]LZ.COM;7 $ CC /NOLIST/OBJECT=LZCMP1 LZCMP1.C $ CC /NOLIST/OBJECT=LZCMP2 LZCMP2.C $ CC /NOLIST/OBJECT=LZCMP3 LZCMP3.C $ SET COMMAND /OBJECT=LZCOMP LZCOMP.CLD $ CC /NOLIST/OBJECT=LZIO LZIO.C $ CC /NOLIST/OBJECT=LZVIO LZVIO.C $ CC /NOLIST/OBJECT=LZDCL LZDCL.C $ mms/noaction/from_sources/output=lz.tmp $ fixmms :== @'f$logical("SYS$DISK")''f$directory()'fixmms.com $ fixmms lz.tmp lz.com $ delete lz.tmp;* $ link/nomap/exe=lzcomp /TRACE/NOMAP/EXEC=LZCOMP - lzcmp1.obj,lzcmp2.obj,lzcmp3.obj,lzcomp.obj, - lzio.obj,lzvio.obj,lzdcl.obj, - sys$library:vaxcrtl/lib $ CC /NOLIST/OBJECT=LZDCM1 LZDCM1.C $ CC /NOLIST/OBJECT=LZDCM2 LZDCM2.C $ CC /NOLIST/OBJECT=LZDCM3 LZDCM3.C $ SET COMMAND /OBJECT=LZDCMP LZDCMP.CLD $ link/nomap/exe=lzdcmp /TRACE/NOMAP/EXEC=LZDCMP - lzdcm1.obj,lzdcm2.obj,lzdcm3.obj,lzdcmp.obj, - lzio.obj,lzvio.obj,lzdcl.obj, - sys$library:vaxcrtl/lib $ If "''F$Search("LZ.HLB")'" .EQS. "" Then LIBRARY/Create/Help LZ.HLB $ LIBRARY/REPLACE LZ.HLB LZ.HLP $ copy nl: nl: -h- fixmms.com Fri Jun 3 10:49:44 1988 USER1:[MINOW.PERSONAL.SOURCE.LZ]FIXMMS.COM;4 $! $! Hack command file to eliminate long lines in command files generated $! by MMS/noaction/from_sources/output=foo.tmp. Note that continuation $! files must be preceeded by two characters. $! $! Usage: $! @ fixmms input output $! $ on error then exit $ on warning then exit $ on control_y then exit $ tab[0,7] = 9 ! Define character $ tabs = tab + tab ! 2 tabs at long line $ open/read src 'p1' $ open/write dst 'p2' $ loop: read/end=endfile src line $ spaceover = "" $ more: i = f$locate(tabs, line) $ if (i .ne. f$length(line)) then goto gotcha $ write dst spaceover, line $ goto loop $ gotcha: $ write dst spaceover, f$extract(0, i, line), "-" $ line = f$extract(i + 2, f$length(line), line) $ spaceover = tabs $ goto more $ endfile: $ close src $ close dst $ exit -h- lzcomp.mem Fri Jun 3 10:49:48 1988 USER1:[MINOW.PERSONAL.SOURCE.LZ]LZCOMP.MEM;9 ____ ___________ 1 File Compression ********** * lzcomp * ********** NAME: lzcomp -- File Compression SYNOPSIS: lzcomp [-options] [infile [outfile]] DESCRIPTION: lzcomp implements the Lempel-Ziv file compression algorithm. (Files compressed by lzcomp are uncompressed by lzdcmp.) It operates by finding common substrings and replaces them with a variable-size code. This is deterministic, and can be done with a single pass over the file. Thus, the decompression procedure needs no input table, but can track the way the table was built. Options may be given in either case. -B Input file is "binary", not "human readable text". This is necessary on Dec operating systems, such as VMS and RSX-11M, that treat these files differently. (Note that binary support is rudamentary and probably insufficient as yet.) (On VMS version 4, this is ignored unless the -x option is specified or the input file is record-oriented.) -M bits Write using the specified number of bits in the code -- necessary for big machines making files for little machines. For example, if compressing a file on VMS which is to be read on a PDP-11, you should select -M 12. -V [n] Verbose if specified. If a value is specified, it will enable debugging code (if compiled in). The values are bit-encoded as follows: -V 0 Don't print anything -V 1 Print a summary of the operation -V 2 Print a running status on occassion -V 4 Print some debug information. -V 8 Print too much debug information -V 16 Dump the compressed files Page 2 lzcomp File Compression -S Perform a transformation on the input file whereby each byte is subtracted from its predecessor. I.e., the difference between bytes is compressed, rather than their actual value. This improves the performance of the algorithm on certain kinds of data files. However, it will worsen the compression of most other kinds of data. Note that files compressed with the -S option cannot be decompressed on versions of Unix compress (or lzdcmp) that do not support this option. The -S option is useful for digitized video images, where each pixel element is represented as a single byte. It may also be useful when compressing digitized encoding of similar analog data, where each sample is contained in a single byte. -X [n] "Export" -- write a file format that can be read by other operating systems. Only the bytes in the file are copied; file attributes are not preserved. If specified, the value determines the level of compatiblity. If not specified, or specified with an explicit value of zero, and lzcomp is running on Vax/VMS version 4 under VaxC and the input file is a disk or magtape file (block-oriented), a VMS-private output format is used which is incompatible with the Unix compress utility, but which preserves VMS file attributes. -X may take on the following values: 0 Choose VMS private format. See restrictions below. 1 Compatible with Unix compress version 3.0, except that an end code is written that may not be compatible with Unix compress. This is the default if -x is given without a value. This is the correct mode to use when transferring files between VMS and RSX-11M, RSTS/E, or RT11, or when transferring files to LZDCMP running on Unix. 2 As above, but do not write the extra end code. This will cause problems when decompressing on RSX-11M and RT11 systems. 3 As in (2) above, but suppress "block compression". 4 As in (3) above, but do not output a compress header block. This is for compatiblity with a quite early version of Unix compress (and requires conditional-compilation to use). Page 3 lzcomp File Compression Note that the -B (binary) option is ignored unless the input file is "record-oriented", such as a terminal or mailbox. The other two arguments are the input and output filenames respectively. Redirection is supported, however, the output must be a disk/tape file. The file format is almost identical to the current Unix implementation of compress (V4.0). Files written by Unix compress should be readable by lzdcmp. Files written by lzcomp in export (-x) format will be readable by Unix compress (except that lzcomp outputs two "clear" codes to mark EOF. A patch to Unix compress is available.) VMS COMMAND LANGUAGE INTERFACE: In addition to the above (Unix-style) command line interface, lzcomp supports a VMS command line interface. The following options are available: /BITS= /EXPORT=(VMS, UNIX, BLOCK, HEADER, ENDMARKER) /MODE=(DELTA) /SHOW=(ALL, PROGRESS, STATISTICS, FDL, DEBUG, DEBUG_SERIOUS, DEBUG_IO) VMS RESTRICTIONS: VMS Private mode stores the true name and attributes of the input file into the compressed file and lzdcmp restores the attributes (and filename if requested). The following restrictions apply -- they may be lifted in the future as they are primarily due to the author's lack of understanding of the intricacies of of VMS I/O: All files must be stored on disk. The lzcomp output file must be specified directly. Also, for all usage on VMS, the compressed file must be written to, and read from disk. The following file attributes are not preserved by lzcomp: File ownership and protection codes. Date of creation, access, and backup. Access control lists. RSX-11M RESTRICTIONS: Page 4 lzcomp File Compression lzcomp cannot determine the file attributes and may not correctly read certain specialized file formats, such as "print image". If a binary file is compressed, note that it will be decompressed as "fixed-block, 512 byte" records. LZW COMPRESSION ALGORITHM: This section is abstracted from Terry Welch's article referenced below. The algorithm builds a string translation table that maps substrings in the input into fixed-length codes. The compress algorithm may be described as follows: 1. Initialize table to contain single-character strings. 2. Read the first character. Set (the prefix string) to that character. 3. (step): Read next input character, K. 4. If at end of file, output code(); exit. 5. If K is in the string table: Set to K; goto step 3. 6. Else K is not in the string table. Output code(); Put K into the string table; Set to K; Goto step 3. "At each execution of the basic step an acceptable input string has been parsed off. The next character K is read and the extended string K is tested to see if it exists in the string table. If it is there, then the extended string becomes the parsed string and the step is repeated. If K is not in the string table, then it is entered, the code for the successfully parsed string is put out as compressed data, the character K becomes the beginning of the next string, and the step is repeated." The decompression algorithm translates each received code into a prefix string and extension [suffix] character. The extension character is stored (in a push-down stack), and the prefix translated again, until the prefix is a single character, which completes decompression of this code. The entire code is then output by popping the stack. "An update to the string table is made for each code received (except the first one). When a code has been translated, its final character is used as the extension character, combined with the prior string, to add a new string to the string table. This new string is assigned a unique code value, which is the same code that the compressor assigned to that string. In this way, the decompressor incrementally reconstructs the same string Page 5 lzcomp File Compression table that the decompressor used.... Unfortunately ... [the algorithm] does not work for an abnormal case. The abnormal case occurs whenever an input character string contains the sequence KKK, where K already appears in the compressor string table." The decompression algorithm, augmented to handle the abnormal case, is as follows: 1. Read first input code; Store in CODE and OLDcode; With CODE = code(K), output(K); FINchar = K; 2. Read next code to CODE; INcode = CODE; If at end of file, exit; 3. If CODE not in string table (special case) then Output(FINchar); CODE = OLDcode; INcode = code(OLDcode, FINchar); 4. If CODE == code(K) then Push K onto the stack; CODE == code(); Goto 4. 5. If CODE == code(K) then Output K; FINchar = K; 6. While stack not empty Output top of stack; Pop stack; 7. Put OLDcode,K into the string table. OLDcode = INcode; Goto 2. The algorithm as implemented here introduces two additional complications. The actual codes are transmitted using a variable-length encoding. The lowest-level routines increase the number of bits in the code when the largest possible code is transmitted. Periodically, the algorithm checks that compression is still increasing. If the ratio of input bytes to output bytes decreases, the entire process is reset. This can happen if the characteristics of the input file change. VMS PRIVATE FILE STRUCTURE: In VMS Private mode, the compressed data file contains a variable-length (but compressed) file header with the Page 6 lzcomp File Compression file "attributes" needed by the operating system to construct the file. This allows the decompression program to recreate the file in its original format, which is essential if ISAM databases are compressed. The overall file format is as follows: LZ_SOH "start of header" signal (this value cannot appear in user data). A variable-length data record (maximum 256 bytes) containing the header name, followed by whitespace, followed by header-specific information. In this case, the name record will contain the string "vms$attributes" followed by the number of bytes in the attribute data block. (I assume that the name record will consist of a facility name, such as "vms", followed by a dollar sign, followed by a facility-unique word.) LZ_EOR Signals "end of record". This is followed by a VMS file attributes record (generated by a VMS system library routine). LZ_EOR Signals "end of record" (optional). This is followed by another "header record" with additional information. Currently, this is only used to transmit the "longest record length" field from the input file. Additional header records may be defined in the future. LZ_ETX Signals "end of segment". ST_STX Signals "start of text" (i.e., start of data file). This is followed by the user data file. LZ_ETX Signals "end of segment" LZ_ETX Two in a row signals "end of file". Note that this format can easily be extended to include trailer records (with file counts and checksums) and/or multiple data files in one compressed file. Note also that the LZ_CLEAR code may appear in headers or data files to cause the decompression program to Page 7 lzcomp File Compression "readapt" to the characteristics of the input data. LZ_STX and LZ_SOH reset the compression algorithm. LZ_EOR does not. AUTHORS: The algorithm is from "A Technique for High Performance Data Compression." Terry A. Welch. IEEE Computer Vol 17, No. 6 (June 1984), pp 8-19. This revision is by Martin Minow. Unix Compress authors are as follows: Spencer W. Thomas (decvax!harpo!utah-cs!utah-gr!thomas) Jim McKie (decvax!mcvax!jim) Steve Davies (decvax!vax135!petsd!peora!srd) Ken Turkowski (decvax!decwrl!turtlevax!ken) James A. Woods (decvax!ihnp4!ames!jaw) Joe Orost (decvax!vax135!petsd!joe) -h- lzdcmp.mem Fri Jun 3 10:49:48 1988 USER1:[MINOW.PERSONAL.SOURCE.LZ]LZDCMP.MEM;4 ____ _____________ 1 File Decompression ********** * lzdcmp * ********** NAME: lzdcmp -- File Decompression SYNOPSIS: lzdcmp [-options] [infile [outfile]] DESCRIPTION: lzdcmp decompresses files compressed by lzcomp. The documentation for lzcomp describes the process in greater detail. Options may be given in either case. -B Output file is "binary", not text. (Ignored in VMS private mode.) On VMS, this generates a stream file without carriage-control attributes. On Decus C systems, it generates a "fixed block, 512 byte record" file. It is unneeded on Unix. -F Output file is "fixed block, 512 bytes." This is used only by VMS for reading files created by Unix compress or by "export" mode. It is identical to binary on Decus C. -X 3 To read files compressed by an old Unix version that doesn't generate header records. -V val Verbose (print status messages and debugging information). The value selects the amount of verbosity. See the lzcomp documentation for details. VMS COMMAND LANGUAGE INTERFACE: In addition to the above (Unix-style) command line interface, lzcomp supports a VMS command line interface. The following options are available: /EXPORT=(VMS, UNIX, HEADER) Page 2 lzdcmp File Decompression /MODE=(TEXT, BINARY, FIXED) /SHOW=(ALL, PROGRESS, STATISTICS, FDL, DEBUG, DEBUG_SERIOUS, DEBUG_IO) VMS PRIVATE MODE: If the file was compressed in VMS private mode, all information needed to reconstruct the file is stored in the compressed file, using the VMS run-time library FDL routines. This means that the expanded file will have the same name and directory location it had originally. If the directory structure does not exist (for example, because you have moved the compressed file to another machine), you must specify the second argument to lzdcmp to specify the file name. AUTHOR: This version by Martin Minow. See lzcomp for more details.