// copyfind.cpp : Compares the text in a set of documents pairwise, looking for matching phrases.
//
// Version 1.2
//
// Copyright (C) 2002 Louis A. Bloomfield
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// Louis A. Bloomfield, Department of Physics, University of Virginia
// 382 McCormick Road, P.O. Box 400714, Charlottesville, VA 22904-4714
// email: bloomfield@virginia.edu
// web site: plagiarism.phys.virginia.edu (see this web site for latest version of copyfind)
//
// If you significantly improve this program, please let me know about it and I
// will consider distributing your version from my web site.
//
//#include "stdafx.h" // used by Microsoft Visual C++ only
#include
#include
#include
void heapsort(unsigned long *tableA,int *tableB,int n);
int docgetchar(FILE *fdocp,int *docbuffer,int *docbuffercount,int minstring);
void docprint(FILE *fdocp,FILE *fhtmlp,int *match,long wordcount,int minstring);
void quitprogram(int value);
class document // data structure for a document
{
public: // data is not hidden so as to maximize speed
document(); // a constructor method
~document(); // a destructor method
char docname[256]; // an entry for the document name
unsigned long *pwordhash; // a pointer to the hash-coded word list
unsigned long *pswordhash; // a pointer to the sorted hash-coded word list
int *pswordnum; // a pointer to the sorted word number list
int words; // an entry for the number of words in the lists
int firsthash; // an entry for the first word with more than 3 chars
};
document::document() // the constructor method
{
pwordhash=NULL; // hash-coded word list not allocated yet
pswordhash=NULL; // sorted hash-coded word list not allocated yet
pswordnum=NULL; // sorted word number list not allocated yet
}
document::~document() // the destructor method
{
if( pwordhash != NULL ) delete [] pwordhash; // if allocated, delete hash-coded word list
if( pswordhash != NULL ) delete [] pswordhash; // if allocated, delete sorted hash-coded word list
if( pswordnum != NULL ) delete [] pswordnum; // if allocated, delete sorted word number list
}
// Program: copyfind
// Purpose: Searches through a set of documents for pairs that share identical text
// Description: Reads in documents listed in file "doclist.txt", then looks through them pairwise for matching
// text in phrases of at least a specified minimum length. When it find more than a threshold
// value of matching text, it generates two html files. These html files contain the text of
// the two documents, with the matching text underlined. The program also generates a file
// "comparisons.txt" that lists the number of matching words in each pair of documents.
// Details: Reads in the list of documents and dynamically allocates a table of document entries, one for
// each document. It then reads in each document, converts each word to a 32-bit hash-coded value
// and dynamically allocates space to store those lists of hash-coded word values. Each document
// entry ends up with pointers to (1) a sequential list of the hash-coded word values, (2) a sorted
// list of those same hash-coded word values, and (3) a list of the word numbers associated with
// the sorted hash-coded word values (so that the program can figure out where each of the
// sorted hash-coded words actually appears in the document).
// After it reads in all the documents and creates the hash-coded lists, the program begins to
// compare documents. It selects two documents, left and right, and goes through their sorted
// hash-coded word lists, beginning with the first entries for words of more than 3 characters.
// It ignores words that don't appear in both documents. When it finds words common to both
// documents, it searches around those words in the normal-order hash-coded word lists to see
// how long the matching phrases are. If they are long enough, it marks the matches in its lists.
// The program must treat redundant words carefully--words that appear more than once in either or
// both documents. For each copy of a redundant word in the left document, it looks through all
// copies of that redundant word in the right document. In general, the right document's counter
// changes first and recycles when redundant words are encountered.
// If the program finds more than the threshold number of matching words in two documents, it
// generates two html files. It embeds the document text in each file, with html codes to
// underline the matching text.
int main(int argc, char* argv[]) // main program
{
FILE *fdocp; // handle for document file (input)
FILE *fhtmlp; // handle for html file (output)
FILE *fcomp; // handle for comparisons report file (output)
int docs,doccount; // total number of documents and running count
int words,wordcount; // total number of words in document and running count
int lcount; // general use counter, for local use only
char dstring[255]; // character buffer for document name strings
int ch; // current character for document input process
int charcount; // character running count
unsigned long inhash; // hash-coded version of a document word
int docL,docR; // document number of left document and right document
int wordcountL,wordcountR; // word running count for left document and right document
int firstL,firstR; // first matching word in left document and right document
int lastL,lastR; // last matching word in left document and right document
int matchedwordsL,matchedwordsR; // total matched words in left document and right document
int wordcountRsave; // saved copy of wordcountR, for redundant words
int docbuffer[100]; // buffer for getting document characters
int docbuffercount; // number of characters in document buffer
int docmax=1000; // number of document entries allocated
int docinc=1000; // step size when allocating more document entries
int wordmax=1000; // number of word entries allocated
int wordinc=1000; // step size when allocating more word entries
document *pdoc,*pxdoc; // normal and temporary document entry pointers
unsigned long *pwordhash,*pxwordhash; // normal and temporary hash-coded word list pointers
int *matchL,*matchR; // left and right matched word list pointers
int comparecount; // running count of documents
int inputvalue; // temporary input value
int phraselength=6; // length of shortest phrase to match
int minstring=10; // length of shortest text string to accept
int wordthreshold=500; // minimum count of matching words to report
int firstnewdoc=1; // first document that hasn't been tested already
printf("%s%s%s%s","Copyfind version 1.2, Copyright (C) 2002 Louis A. Bloomfield\n\n",
"Copyfind comes with ABSOLUTELY NO WARRANTY. This is free software,\n",
"and you are welcome to redistribute it under certain conditions.\n",
"For details, see http://www.gnu.org/copyleft/gpl.html\n\n");
inputvalue=0;
while( inputvalue<=0 )
{
printf("%s","Enter minimum number of words in a phrase\n (6 is suggested choice): ");
scanf("%d",&inputvalue);
}
phraselength=inputvalue;
inputvalue=0;
while( inputvalue<=0 )
{
printf("%s","Enter minimum number of characters to consider as text\n (10 is suggested choice): ");
scanf("%d",&inputvalue);
}
minstring=inputvalue;
inputvalue=0;
while( inputvalue<=0 )
{
printf("%s","Enter minimum number of matching words to report\n (500 is a suggested choice): ");
scanf("%d",&inputvalue);
}
wordthreshold=inputvalue;
inputvalue=0;
while( inputvalue<=0 )
{
printf("%s","Enter number of first new document\n (usually 1, unless you have checked before): ");
scanf("%d",&inputvalue);
}
firstnewdoc=inputvalue;
printf("%s%d%s%d%s%d%s%d%s","\nPhrase length = ",phraselength, " words\nText string >= ",
minstring," characters\nReport Threshold = ",wordthreshold," word\nFirst New Document = ",
firstnewdoc,"\n\n");
printf("%s","Loading and Hashing Documents\n"); // first step: load documents and hash them
if((fcomp=fopen("comparisons.txt", "w")) == NULL) // open comparison report file
{
printf("Cannot open comparisons.txt file.\n"); // if failed, report
quitprogram(1); // and quit program
}
if((fdocp=fopen("doclist.txt", "r")) == NULL) // open list of documents
{
printf("Cannot open doclist.txt file.\n"); // if failed, report
quitprogram(1); // and quit program
}
if( (pdoc = new document[docmax]) == NULL ) // allocate the maximum number of document entries
{
printf("Could not allocate enough memory for the document list.\n");
quitprogram(1);
}
doccount=0; // set count of documents to zero
while(!feof(fdocp)) // loop until the list of documents is finished
{
if (doccount==docmax) // if the document entries are full
{
if( (pxdoc = // allocate a larger array of entries
new document[docmax+docinc]) == NULL )
{
printf("Could not allocate enough memory for the document list.\n");
quitprogram(1);
}
for(lcount=0;lcount0x20) && (ch<=0x7E) ) // if character is normal text,
if( (ch>0x20) && (ch!=0x7F) && (ch!=0xFF) ) // if character is normal text, (including non-English characters)
{
inhash= // xor into the rotateleft(7) of inhash
((inhash << 7)|(inhash >> 23))^ch;
charcount++; // and increment the count of characters in the word
}
else // if not a printing character
{
if( charcount>0 ) // if there is a word underway (nonzero length),
{
if (wordcount==wordmax) // if hash-coded word entries are full
{
if( (pxwordhash = new unsigned long[wordmax+wordinc]) == NULL )
{ // allocate new, larger array of entries
printf("Could not allocate enough memory for the word data array.\n");
quitprogram(1);
}
for(lcount=0;lcount3 letter word
fclose(fdocp); // close the document
}
if( (matchL = new int[wordmax]) == NULL ) // allocate array for left match markers
{
printf("Could not allocate enough memory for the word matching arrays.\n");
quitprogram(1);
}
if( (matchR = new int[wordmax]) == NULL ) // allocate array for right match markers
{
printf("Could not allocate enough memory for the word matching arrays.\n");
quitprogram(1);
}
printf("%s","Comparing Documents\n"); // second, compare documents
if(firstnewdoc<2) // if all documents are considered new,
{
firstnewdoc=2; // start comparisons with the second document
}
comparecount=0;
for(docL=firstnewdoc-1;docL3 letter word
wordcountR=pdoc[docR].firsthash; // start right at first >3 letter word
wordcountRsave=wordcountR; // prepare right redundant word pointer
matchedwordsL=0; // zero count of left matched words
matchedwordsR=0; // zero count of right matched words
while ( (wordcountL < pdoc[docL].words) // loop while there are still words to check
&& (wordcountR < pdoc[docR].words) )
{
// if the next word in the left sorted hash-coded list has been matched
if( matchL[pdoc[docL].pswordnum[wordcountL]] != 0 )
{
wordcountL++; // advance to next left word
continue;
}
// if the next word in the right sorted hash-coded list has been matched
if( matchR[pdoc[docR].pswordnum[wordcountR]] != 0 )
{
wordcountR++; // skip to next right sorted hash-coded word
continue;
}
// check for left word less than right word
if( pdoc[docL].pswordhash[wordcountL] < pdoc[docR].pswordhash[wordcountR] )
{
wordcountL++; // advance to next left word
if ( wordcountL >= pdoc[docL].words) break;
if ( pdoc[docL].pswordhash[wordcountL] == pdoc[docL].pswordhash[wordcountL-1] )
{
wordcountR=wordcountRsave;
}
else
{
wordcountRsave=wordcountR;
}
continue; // and resume looping
}
// check for right word less than left word
if( pdoc[docL].pswordhash[wordcountL] > pdoc[docR].pswordhash[wordcountR] )
{
wordcountR++; // advance to next right word
wordcountRsave=wordcountR; // set pointer back to top of redundant words
continue; // and resume looping
}
// we have a match, so look up and down the hash-coded (not sorted) lists for matches
firstL=pdoc[docL].pswordnum[wordcountL]-1; // start left just before current word
lastL=pdoc[docL].pswordnum[wordcountL]+1; // end left just after current word
firstR=pdoc[docR].pswordnum[wordcountR]-1; // start right just before current word
lastR=pdoc[docR].pswordnum[wordcountR]+1; // end right just after current word
while( (firstL >= 0) && (firstR >= 0) ) // if both starting words match,
{
// make sure that left and right words haven't been used in a match before and
// that the two words actually match. If so, move up another word and repeat the test.
if( matchL[firstL] != 0 ) break;
if( matchR[firstR] != 0 ) break;
if( pdoc[docL].pwordhash[firstL] != pdoc[docR].pwordhash[firstR] ) break;
firstL--;
firstR--;
}
while( (lastL < pdoc[docL].words) && (lastR < pdoc[docR].words) ) // if both end words match
{
// make sure that the left and right words haven't been used in a match before and
// that the two words actually match. If so, move down another word and repeat the test.
if( matchL[lastL] != 0 ) break;
if( matchR[lastR] != 0 ) break;
if( pdoc[docL].pwordhash[lastL] != pdoc[docR].pwordhash[lastR] ) break;
lastL++;
lastR++;
}
firstL++; // correct for overshoot
firstR++; // correct for overshoot
lastL--; // correct for overshoot
lastR--; // correct for overshoot
if( lastL-firstL+1 >= phraselength ) // check that phrase is long enough to mark
{
for(lcount=firstL;lcount<=lastL;lcount++) // loop for all left matched words
{
matchL[lcount]=1; // mark those words as matched
}
for(lcount=firstR;lcount<=lastR;lcount++) // loop for all right matched words
{
matchR[lcount]=1; // mark those words as matched
}
matchedwordsL += lastL-firstL+1; // update count of left matched words
matchedwordsR += lastR-firstR+1; // update count of right matched words
}
wordcountR++; // skip to next right sorted hash-coded word
}
// report number of matching words in the two documents
fprintf(fcomp,"%d\t%d\t%s\t%s\n",matchedwordsL,matchedwordsR,pdoc[docL].docname,pdoc[docR].docname);
comparecount++; // increment count of comparisons
if( (comparecount%1000) == 0 ) // if count is divisible by 1000,
{
printf("%s%d%s","\r",comparecount," document comparisons completed"); // report count
}
if(matchedwordsL>=wordthreshold) // if there are enough matches to report,
{
strcpy(dstring,pdoc[docL].docname); // assemble html file name for left file
strncat(dstring,".",80);
strncat(dstring,pdoc[docR].docname,80);
strncat(dstring,".html",80);
if((fhtmlp=fopen(dstring,"w")) == NULL) // open html file
{
printf("Cannot open file ",dstring,"\n");
quitprogram(1);
}
// create header material for html file
fprintf(fhtmlp,"%s%s%s%s%s%d%s","Markup of ",pdoc[docL].docname,
" for Comparison with ",pdoc[docR].docname,
" (Matched Words = ",matchedwordsL,")\n");
if((fdocp=fopen(pdoc[docL].docname, "rb")) == NULL) // open left document
{
printf("%s%d%s%s%s","Cannot open document #",docL," = ",pdoc[docL].docname,"\n");
quitprogram(1);
}
// generate text body of html file, with matching words underlined
docprint(fdocp,fhtmlp,matchL,pdoc[docL].words,minstring);
fclose(fdocp); // close document
fprintf(fhtmlp,"%s","\n\n"); // complete html file
fclose(fhtmlp); // close html file
strcpy(dstring,pdoc[docR].docname); // assemble html file name for right file
strncat(dstring,".",80);
strncat(dstring,pdoc[docL].docname,80);
strncat(dstring,".html",80);
if((fhtmlp=fopen(dstring,"w")) == NULL) // open html file
{
printf("Cannot open file ",dstring,"\n"); // if failed, report
quitprogram(1); // and quit program
}
// create header material for html file
fprintf(fhtmlp,"%s%s%s%s%s%d%s","Markup of ",pdoc[docR].docname,
" for Comparison with ",pdoc[docL].docname,
" (Matched Words = ",matchedwordsL,")\n");
if((fdocp=fopen(pdoc[docR].docname, "rb")) == NULL) // open right document
{
printf("%s%d%s%s%s","Cannot open document #",docR," = ",pdoc[docR].docname,"\n");
quitprogram(1); // if failed, report and quit
}
// generate text body of html file, with matching words underlined
docprint(fdocp,fhtmlp,matchR,pdoc[docR].words,minstring);
fclose(fdocp); // close document
fprintf(fhtmlp,"%s","\n\n"); // complete html file
fclose(fhtmlp); // close html file
}
}
}
fclose(fcomp); // close comparisons file
printf("%s","\nComparisons Done\n"); // report completion
quitprogram(0); // and quit program
return 0;
}
// Function: heapsort
// Purpose: Sorts two tables together to put the first table in numerical order
// Details: Sorts both tableA and tableB together, putting tableA in numerical order and taking tableB
// with it.
// Description: An implementation of the classic heapsort algorithm. The first half of the routine puts
// the heap in order so that the value at the top of each two-branch node is greater than either of the two
// values in the branches below it. The second half of the routine takes values off the top of the heap
// and then resorts the heap so as to fill the opening and promote the remaining values. Ultimately, the
// entire heap is put into ascending numerical order
void heapsort(unsigned long *tableA,int *tableB,int n)
{
unsigned long tempA; // tempA can hold a hash-coded word temporarily
int tempB; // tempB can hold a word number temporarily
int nr; // number of remaining entries in the heap
int index1,index2,indexc,indexstart; // pointers into the heap
indexstart = (n >> 1); // start at the lowest two levels of the heap
index1 = (n >> 1); // index1 points to second-to-lowest level of heap
index2 = (index1 << 1); // index2 points to lowest level of heap
for ( indexc=indexstart;indexc>=1;indexc--) // loop and work backwards to top of heap
{
index1 = indexc; // pointer to the current node
index2 = (indexc << 1); // pointer to left branch below node
while ( index2 <= n ) // while we haven't gone off bottom of heap
{
if( index2 < n ) // if there are two values branching below this node,
{
if( tableA[index2] // choose the largest value
< tableA[index2+1] )
{
index2++; // and point to it
}
}
if( tableA[index1] < tableA[index2] ) // if the node is less than the largest value below,
{
tempA=tableA[index1]; // swap node and largest value below, part 1
tempB=tableB[index1]; // swap word number table entry to match, part 1
tableA[index1]=tableA[index2]; // swap part 2
tableB[index1]=tableB[index2]; // swap part 2
tableA[index2]=tempA; // swap part 3
tableB[index2]=tempB; // swap part 3
index1 = index2; // pointer to the next lower level of heap
index2 = (index2 << 1); // pointer to the left branch below that node
}
else // else, this node is properly ordered, so move along
{
break;
}
}
}
for ( nr=n-1;nr>=1;nr-- ) // start at end of heap and move backward to start
{
tempA=tableA[1]; // swap top node and value beyond end of shortened heap, part1
tempB=tableB[1]; // swap word number table entry to match, part 1
tableA[1]=tableA[nr+1]; // swap part 2
tableB[1]=tableB[nr+1]; // swap part 2
tableA[nr+1]=tempA; // swap part 3
tableB[nr+1]=tempB; // swap part 3
index1 = 1; // pointer to the current node (top of heap)
index2 = 2; // pointer to left branch below node
while ( index2 <= nr ) // while we haven't gone off bottom of heap
{
if( index2 < nr ) // if there are two values branching below this node,
{
if( tableA[index2] // choose the largest value
< tableA[index2+1] )
{
index2++; // and point to it
}
}
if( tableA[index1] < tableA[index2] ) // if the node is less than the largest value below,
{
tempA=tableA[index1]; // swap node and largest value below, part 1
tempB=tableB[index1]; // swap word number table entry to match, part 1
tableA[index1]=tableA[index2]; // swap part 2
tableB[index1]=tableB[index2]; // swap part 2
tableA[index2]=tempA; // swap part 3
tableB[index2]=tempB; // swap part 3
index1 = index2; // pointer to the next lower level of heap
index2 = (index2 << 1); // pointer to the left branch below that node
}
else // else, this node is properly ordered, so move along
{
break;
}
}
}
return; // both tables are all sorted, so return
}
// Function: docgetchar
// Purpose: Gets the next character from the document
// Details: Skips non-printing characters and underlength text strings that lie between non-printing
// characters. Converts smart-quotes to ordinary ones. Converts TAB characters to spaces and
// carriage returns to linefeeds.
// Description: Tries to maintain a buffer of text, LF, space, or EOF characters that is as long as the
// minimum length specification. It throws out non-printing characters and any text
// strings that aren;t minlength long. It works by filling the buffer whenever it's empty and
// keeping it full when possible. But as soon as it encounters a non-printing character, it
// lets the buffer evaporate to zero length. It then refills it, ignoring text strings that
// that aren't long enough to fill it completely.
int docgetchar(FILE *fdocp,int *docbuffer,int *docbuffercount,int minstring)
{
const int TAB=0x9; // The ASCII TAB character
const int LF=0xA; // The ASCII LF character (linefeed)
const int CR=0xD; // The ASCII CR character (carriage return)
const int SPACE=0x20; // The ASCII SPACED character
int ch,chout; // ch (current character), chout (output character)
int lcount; // count (a counter)
if( *docbuffercount==0 ) // if the buffer is empty
{
while(*docbuffercount < minstring) // loop until the buffer is full to the minimum string length
{
ch=fgetc(fdocp); // get the next character from the document
if( ch==0 ) // possibly a double-byte character,
{
ch=fgetc(fdocp); // so get second byte and continue
}
if( ch<0 ) // handle an eof
{
docbuffer[*docbuffercount]=ch; // put the eof in the the buffer
(*docbuffercount)++; // increment the number of characters in the buffer
break; // don't look for any more characters in the document
}
if( (ch==0x93) || (ch==0x94) ) ch=0x22; // convert fancy double-quotes to regular double-quotes
if( (ch==0x91) || (ch==0x92) ) ch=0x27; // convert fancy single-quotes to regular single-quotes
// if( (ch>0x20) && (ch<=0x7E) ) // handle normal text characters
if( (ch>0x20) && (ch!=0x7F) && (ch!=0xFF) ) // handle normal text characters, including non-English Characters
{
docbuffer[*docbuffercount]=ch; // store the normal text character in the buffer
(*docbuffercount)++; // increment the number of characters in the buffer
}
else if( ch==LF ) // handle a linefeed character (end of paragraph)
{
docbuffer[*docbuffercount]=LF; // put the linefeed character in the buffer
(*docbuffercount)++; // increment the number of characters in the buffer
}
else if( ch==SPACE ) // handle a space character
{
docbuffer[*docbuffercount]=SPACE; // put the space character in the buffer
(*docbuffercount)++; // increment the number of characters in the buffer
}
else if( ch==TAB ) // handle a tab character
{
docbuffer[*docbuffercount]=SPACE; // put a space character in the buffer instead of a tab
(*docbuffercount)++; // increment the number of characters in the buffer
}
else if( ch==CR ) // handle a carriage return character
{
docbuffer[*docbuffercount]=LF; // put a linefeed character in the buffer instead of a CR
(*docbuffercount)++; // increment the number of characters in the buffer
}
else // handle characters that aren't text characters
{
(*docbuffercount)=0; // truncate the buffer (text string less than minimum)
}
}
}
if( (*docbuffercount == minstring) && (docbuffer[(*docbuffercount)-1] >= 0 )) // if the buffer is full and there isn't an eof at the end
{
chout=docbuffer[0]; // prepare to return the first character in the buffer
for(lcount=0;lcount= 0) ) // loop until the buffer is full to the minimum string length
{
ch=fgetc(fdocp); // get another character from the document
if( ch==0 ) // possibly a double-byte character,
{
ch=fgetc(fdocp); // so get second byte and continue
}
if( (ch==0x93) || (ch==0x94) ) ch=0x22; // convert fancy double-quotes to regular double-quotes
if( (ch==0x91) || (ch==0x92) ) ch=0x27; // convert fancy single-quotes to regular single-quotes
// if( (ch<0) || ((ch>0x20) && (ch<=0x7E)) || // handle eof, text characters, linefeed, or space
if( (ch<0) || ((ch>0x20) && (ch!=0x7F) && (ch!=0xFF)) || // handle eof, text characters, linefeed, or space
(ch==LF) || (ch==SPACE) )
{
docbuffer[*docbuffercount]=ch; // put the character in the buffer
(*docbuffercount)++; // increment the number of characters in the buffer
}
else if( ch==TAB ) // handle a tab character
{
docbuffer[*docbuffercount]=SPACE; // put a space character in the buffer instead of a tab
(*docbuffercount)++; // increment the number of characters in the buffer
}
else if( ch==CR ) // handle a carriage return character
{
docbuffer[*docbuffercount]=LF; // put a linefeed character in the buffer instead of a CR
(*docbuffercount)++; // increment the number of characters in the buffer
}
else
{
break;
}
}
return chout; // return the character that was at the front of the buffer
}
else // if the buffer is only partly full
{
chout=docbuffer[0]; // prepare to return the first character in the buffer
for(lcount=0;lcount"); // turn on underlining
uline=1; // underlining is active
}
else if(uline==1 && (match[wordcount]!=1)) // have we encountered a nonmatch while underlining?
{
fprintf(fhtmlp,"%s",""); // turn off underlining
uline=0; // underlining is not active
}
int charcount=0; // we have no characters in our current word
while( true ) // loop until a break occurs
{
ch=docgetchar(fdocp,docbuffer, // get the next character from the document
&docbuffercount,minstring);
if( ch<0 ) // have we hit the eof?
{
break; // if so, break (we're done)
}
// if( (ch>0x20) && (ch<=0x7E) ) // is this an ordinary text character?
if( (ch>0x20) && (ch!=0x7F) && (ch!=0xFF) ) // is this an ordinary text character (including non-English)?
{
outstring[0]=(char)ch; // if so, prepare to output that character
outstring[1]='\0'; // terminate the string
fprintf(fhtmlp,"%s",outstring); // print the character to the html file
charcount++; // increase the number of characters in the current word
}
else if( ch==LF ) // have we hit a linefeed?
{
fprintf(fhtmlp,"%s","
\n"); // if so, insert a paragraph boundary into the html file
if( charcount>0 ) break; // if we had a word going, break to begin the next word
}
else if( charcount>0 ) // have we hit a nonprinting character while in a word?
{
fprintf(fhtmlp,"%s"," "); // if so, insert a blank into the html file
break; // break to begin the next word
} // if we aren't in a word and hit a nonprinting character, ignore it
}
}
}
void quitprogram(int value)
{
printf("%s","Press Enter to Quit Program\n");
fgetc(stdin);
exit(value);
}