.TITLE SOUNDEX_DESC .IDENT /V1.09/ ; status = soundex_desc(in_desc, out_desc) ; Returns a soundex string (str) for the argument string (string). ; This version works with arguments passed as descriptors (in ; VAX FORTRAN, this can be CHARACTER*n data types). ; Function almost always returns SS$_NORMAL for status. Will return ; an error for an non-existant input or output address. ; This version skips over punctuation marks ; V1.8: fix some problems when the second character is not ; a letter (as in X-RAY) 01-May-1989 ; Also add longword alignment of code. ; V1.9: varying string area for input copy 25-Jun-1990 ; B. Z. Lederman ; The following definition (without the ;! characters) will make ; the function available in Datatrieve-32 ;!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ; FN$SOUNDEX - Return a Soundex value for a character string ; ; B. Z. Lederman. Requres SOUNDEX_DESC.MAR ; ; output is the Soundex value in a 4 character string (text) ; input is a source string descriptor ; ;!$DTR$FUN_DEF FN$SOUNDEX, SOUNDEX_DESC, 2 ;! $DTR$FUN_OUT_ARG TYPE = FUN$K_STATUS ;! $DTR$FUN_HEADER HDR = <"Soundex"> ;! $DTR$FUN_EDIT_STRING ^\X(4)\ ;! $DTR$FUN_IN_ARG TYPE = FUN$K_DESC, DTYPE = DSC$K_DTYPE_T, ORDER = 1 ;! $DTR$FUN_IN_ARG TYPE = FUN$K_TEXT, OUT_PUT = TRUE, ALL_LEN = 4 ;!$DTR$FUN_END_DEF ;!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! .PSECT P_DATA, LONG, CON, REL, LCL, SHR, NOEXE, RD, NOWRT ; S_V should be zero for efficiency (test against zero later). S_V = 0 ; Vowels: a e i o u (h, w, y) S_BP = ^A'1' ; labials b, p S_SK = ^A'2' ; "hard" consonants: c k q s x z S_TD = ^A'3' ; Dental stops: t d S_L = ^A'4' ; l S_M = ^A'5' ; m (sometimes includes n) S_R = ^A'6' ; r S_FV = ^A'7' ; f v S_GJ = ^A'8' ; g j S_N = ^A'9' ; n SKIP = -1 ; characters to skip over (punctuation, etc.) TERM = -2 ; characters to ignore or which terminate processing ; (Blank, Escape, Carriage Return, Line Feed, etc.) ; The following are the subtitute values for each letter. SX_LETTERS: .BYTE TERM, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP ; NUL-Bell .BYTE SKIP, TERM, TERM, TERM, TERM, TERM, SKIP, SKIP ; BS - SI .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP ; DLE - ETB .BYTE SKIP, SKIP, SKIP, TERM, SKIP, SKIP, SKIP, SKIP ; CAN - US .BYTE TERM, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP ; Space - ' .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP ; ( - / .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP ; 0-7 .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP ; 8-? .BYTE SKIP, S_V, S_BP, S_SK, S_TD, S_V, S_FV, S_GJ ; @, A-G .BYTE S_V, S_V, S_GJ, S_SK, S_L, S_M, S_N, S_V ; H-O .BYTE S_BP, S_SK, S_R, S_SK, S_TD, S_V, S_FV, S_V ; P-W .BYTE S_SK, S_V, S_SK, SKIP, SKIP, SKIP, SKIP, SKIP ; X-_ ; repeat for lower case (mostly as a place holder) .BYTE SKIP, S_V, S_BP, S_SK, S_TD, S_V, S_FV, S_GJ ; `, a-g .BYTE S_V, S_V, S_GJ, S_SK, S_L, S_M, S_N, S_V ; h-o .BYTE S_BP, S_SK, S_R, S_SK, S_TD, S_V, S_FV, S_V ; p-w .BYTE S_SK, S_V, S_SK, SKIP, SKIP, SKIP, SKIP, TERM ; x, y, z ; Start 8 bit characters. Ignore control and punctuation .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP .BYTE TERM, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP ; DCS .BYTE SKIP, SKIP, SKIP, TERM, SKIP, SKIP, SKIP, SKIP ; CSI .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP .BYTE SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP, SKIP .BYTE S_V, S_V, S_V, S_V, S_V, S_V, S_V, S_SK ; cedilla .BYTE S_V, S_V, S_V, S_V, S_V, S_V, S_V, S_V ; All accents .BYTE SKIP, S_N, S_V, S_V, S_V, S_V, S_V, S_V ; N with tilde .BYTE S_V, S_V, S_V, S_V, S_V, S_V, SKIP, S_SK ; Lower case is again mostly a place-holder .BYTE S_V, S_V, S_V, S_V, S_V, S_V, S_V, S_SK ; cedilla .BYTE S_V, S_V, S_V, S_V, S_V, S_V, S_V, S_V ; All accents .BYTE SKIP, S_N, S_V, S_V, S_V, S_V, S_V, S_V ; N with tilde .BYTE S_V, S_V, S_V, S_V, S_V, S_V, SKIP, SKIP ; The first letter of the following consonant pairs is silent, and is ; replaced with the second letter. LEADING_SILENT: .ASCII /TTGKPPPM/ NEXT_SILENT: .ASCII /SZHNNTFN/ .PSECT L_OCAL, LONG, CON, REL, LCL, NOSHR, NOEXE, RD, WRT IN_COPY_LEN = 64 ; should be a long enough string OUT_COPY_LEN = 4 ; maximum size of output string ; Use a local copy of input string to work with IN_DSC_LEN: .WORD IN_COPY_LEN .BYTE DSC$K_DTYPE_VT ; descriptor type .BYTE DSC$K_CLASS_VS ; descriptor class IN_DSC_ADR: .ADDRESS IN_BUF ; Make a string to use for temporary output OUT_DSC_LEN: .WORD OUT_COPY_LEN .BYTE DSC$K_DTYPE_T ; descriptor type .BYTE DSC$K_CLASS_S ; descriptor class OUT_DSC_ADR: .ADDRESS OUT_BUF ; Local working space IN_BUF: .BLKB IN_COPY_LEN OUT_BUF: .BLKB OUT_COPY_LEN ; Compute the soundex string for the argument string. somewhat ; modified from the original algorithm by Martin Minow ; "Margaret K. Odell and Robert C. Russell. U.S. patents ; 1261167 (1918) and 1435663 (1922)." as reprinted in Donald Knuth, ; "Sorting and Searching." ; Re-modified to return it to character string output and more reasonable ; value assignments for letters (can't have more than 10 values) ; R11 is the current character in the input string ; R10 is the previous character ; R9 is a count of characters processed (output) ; R8 is a pointer to the local output string copy (copy of address) ; R7 is a pointer to the local input string copy (copy of address) ; R6 is the next character in the input string ; R5 is a temp. value, then counter on leading silent pair processing ; R4 is the Soundex value of the current character ; R3 is used to count number of characters processed (input) ; R0 is the standard register to hold a return status (not saved or preserved) .PSECT C_ODE, LONG, CON, REL, LCL, SHR, EXE, RD, NOWRT ; Offsets on AP to arguments IN_DESC = 4 OUT_DESC = 8 .ENTRY SOUNDEX_DESC, ^M TSTL IN_DESC(AP) ; Check address of input string descrip. BNEQ 10$ ; branch if not zero (null) address MOVL #SS$_BADPARAM, R0 ; indicate bad parameter RET ; and return .ALIGN LONG 10$: TSTL OUT_DESC(AP) ; Check address of output string descrip. BNEQ 20$ ; branch if not zero (null) address MOVL #SS$_INSFARG, R0 ; indicate not enough parameters RET ; and return .ALIGN LONG 20$: MOVL #^A'0000', OUT_BUF ; Clear (put '0000' in) output string CLRL R9 ; characters copied so far is zero CLRL R11 ; insure space for input chars. is clear CLRL R6 ; same for next character CLRL R10 ; and previous character ; We need to obtain the character string from the caller. This ; would normally require checking access, checking the type of ; string, etc. Instead, we will use an RTL string routine which ; will get the input data and convert it to a fixed string type. ; If the copy fails we can simply pass the failure code back to ; the caller. We will also convert everything to upper case, as ; Soundex is case-insesitive and it makes subsequent comparisons ; easier. MOVW #IN_COPY_LEN, IN_DSC_LEN ; initialize copy area ; 1.9 MOVAB IN_BUF, IN_DSC_ADR ; " PUSHL IN_DESC(AP) ; address of input descriptor PUSHAQ IN_DSC_LEN ; address of local copy CALLS #2, G^STR$UPCASE ; copy with truncate/pad as needed CMPL R0, #SS$_NORMAL ; did it work? BEQL 30$ ; branch if yes CMPL R0, #STR$_TRU ; did it work but truncate input? BEQL 30$ ; branch if yes, that's O.K. too RET ; otherwise return with error to our caller .ALIGN LONG 30$: MOVL IN_DSC_ADR, R7 ; "address" of copied string MOVZWL (R7)+, R3 ; get length of copied string BGTR 40$ ; branch if we have something to process MOVL #SS$_NORMAL, R0 ; nothing to do, quit RET .ALIGN LONG 40$: MOVAB OUT_BUF, R8 ; Address of output work space ; Start processing input characters GET_CHAR: DECL R3 ; count down one character input BGTR 50$ ; branch if there are characters to process BRW RETURN_OUTPUT ; otherwise, time to quit. .ALIGN LONG 50$: MOVB (R7)+, R11 ; Get the next input character CVTBL SX_LETTERS[R11], R4 ; get soundex value BGEQ GOT_ALPHA ; branch if it's a letter ; This is a non-alpha character: we want to skip over all ; punctuation characters, all digits (numbers), and all leading ; characters including control characters and spaces, until the ; first printing character is found. After that: we skip ; punctuation and letters, but stop processing if we hit a ; terminating character (Blank, Null, LF, CR, FF, etc.) as defined ; in the Soundex value table. TSTL R9 ; Is this the first character? BEQL GET_CHAR ; branch if yes, skip everything CMPL R4, #TERM ; is it a terminating character? BNEQ GET_CHAR ; branch if not, get another character BRW RETURN_OUTPUT ; otherwise leave .ALIGN LONG ; end of filtering input characters GOT_ALPHA: ; Special processing for two character pairs CMPL R3, #1 ; have we gotten to the last character? BLEQ NOT_FIRST ; branch if there isn't a next character MOVB (R7), R6 ; get the next character (don't increment) TSTB R6 ; is the next character a null? BEQL 55$ ; branch if yes. CVTBL SX_LETTERS[R6], R5 ; get soundex value of next char. BLSS 55$ ; if minus, can't be a letter ; Change 'PH' to 'F' CMPB R11, #^A'P' ; is the current character 'P'? BNEQ 60$ ; branch if not. CMPB R6, #^A'H' ; is the next character 'H'? BNEQ 60$ ; branch if not. MOVB #^A'F', R11 ; Replace current character with 'F' CVTBL SX_LETTERS[R11], R4 ; get soundex value of 'F' MOVB (R7)+, R6 ; skip over 'H' to next character 55$: TSTL R9 ; is this the beginning of the word? BEQL MOVE_FIRST ; if yes, do special processing BRB NOT_FIRST ; otherwise, just another letter .ALIGN LONG 60$: TSTL R9 ; is this the beginning of the word? BEQL 70$ ; if yes, need to check leading silents BRB NOT_FIRST ; otherwise no special processing .ALIGN LONG ; Ignore the first letter in a silent-pair ; Start with the number of characters to check (-1 as offset is from zero) 70$: MOVL #, R5 80$: CMPB R11, LEADING_SILENT[R5] ; does character match leading_silent? BNEQ 90$ ; branch if no match CMPB R6, NEXT_SILENT[R5] ; does next char. match next_silent? BNEQ 90$ ; branch if no match ; if we get here, both match MOVB (R7)+, R11 ; 2nd character becomes 1st char. CVTBL SX_LETTERS[R11], R4 ; get soundex value of new char. BRB MOVE_FIRST ; and get out of the loop early .ALIGN LONG 90$: SOBGEQ R5, 80$ ; continue until all tested ; end special for first letter ; end special two character MOVE_FIRST: MOVB R11, (R8)+ ; first character in is first character out INCL R9 ; we have one char. (R9 should have been 0) BRB END_CHAR .ALIGN LONG NOT_FIRST: ; soundex value should already be in R4 TSTL R4 ; check the Soundex value BEQL END_CHAR ; if it's S_V (zero), skip it CMPB R11, R10 ; if it's same as previous character BEQL END_CHAR ; also skip, otherwise it's O.K. MOVB R4, (R8)+ ; move value to output INCL R9 ; count number of characters ; end of character processing END_CHAR: CMPL R9, #OUT_COPY_LEN ; have we obtained enough output characters ? BEQL RETURN_OUTPUT ; if yes, time to leave MOVB R11, R10 ; Current char. becomes the previous character BRW GET_CHAR ; and get the next character .ALIGN LONG ; Return the computed Soundex string back to user. As with the input, ; we will use an RTL string copy routine to avoid having to test ; access, convert string types, etc. Notice that, although the user ; ought to allocate a string which is at least 4 characters long, I'm ; going to force a truncate on output to be a sucessful result. RETURN_OUTPUT: PUSHAQ OUT_DSC_LEN ; address of local copy PUSHL OUT_DESC(AP) ; address of output descriptor CALLS #2, G^STR$COPY_DX ; copy with truncate/pad as needed CMPL R0, #STR$_TRU ; did it work but truncate input? BNEQ 100$ ; branch if no, everything else is ; passed back to the caller as-is. MOVL #SS$_NORMAL, R0 ; we will ignore truncate on output 100$: RET ; return result of copy to our caller .END