// copyfind.cpp : Compares the text in a set of documents pairwise, looking for matching phrases. // // Version 1.2 // // Copyright (C) 2002 Louis A. Bloomfield // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // Louis A. Bloomfield, Department of Physics, University of Virginia // 382 McCormick Road, P.O. Box 400714, Charlottesville, VA 22904-4714 // email: bloomfield@virginia.edu // web site: plagiarism.phys.virginia.edu (see this web site for latest version of copyfind) // // If you significantly improve this program, please let me know about it and I // will consider distributing your version from my web site. // //#include "stdafx.h" // used by Microsoft Visual C++ only #include #include #include void heapsort(unsigned long *tableA,int *tableB,int n); int docgetchar(FILE *fdocp,int *docbuffer,int *docbuffercount,int minstring); void docprint(FILE *fdocp,FILE *fhtmlp,int *match,long wordcount,int minstring); void quitprogram(int value); class document // data structure for a document { public: // data is not hidden so as to maximize speed document(); // a constructor method ~document(); // a destructor method char docname[256]; // an entry for the document name unsigned long *pwordhash; // a pointer to the hash-coded word list unsigned long *pswordhash; // a pointer to the sorted hash-coded word list int *pswordnum; // a pointer to the sorted word number list int words; // an entry for the number of words in the lists int firsthash; // an entry for the first word with more than 3 chars }; document::document() // the constructor method { pwordhash=NULL; // hash-coded word list not allocated yet pswordhash=NULL; // sorted hash-coded word list not allocated yet pswordnum=NULL; // sorted word number list not allocated yet } document::~document() // the destructor method { if( pwordhash != NULL ) delete [] pwordhash; // if allocated, delete hash-coded word list if( pswordhash != NULL ) delete [] pswordhash; // if allocated, delete sorted hash-coded word list if( pswordnum != NULL ) delete [] pswordnum; // if allocated, delete sorted word number list } // Program: copyfind // Purpose: Searches through a set of documents for pairs that share identical text // Description: Reads in documents listed in file "doclist.txt", then looks through them pairwise for matching // text in phrases of at least a specified minimum length. When it find more than a threshold // value of matching text, it generates two html files. These html files contain the text of // the two documents, with the matching text underlined. The program also generates a file // "comparisons.txt" that lists the number of matching words in each pair of documents. // Details: Reads in the list of documents and dynamically allocates a table of document entries, one for // each document. It then reads in each document, converts each word to a 32-bit hash-coded value // and dynamically allocates space to store those lists of hash-coded word values. Each document // entry ends up with pointers to (1) a sequential list of the hash-coded word values, (2) a sorted // list of those same hash-coded word values, and (3) a list of the word numbers associated with // the sorted hash-coded word values (so that the program can figure out where each of the // sorted hash-coded words actually appears in the document). // After it reads in all the documents and creates the hash-coded lists, the program begins to // compare documents. It selects two documents, left and right, and goes through their sorted // hash-coded word lists, beginning with the first entries for words of more than 3 characters. // It ignores words that don't appear in both documents. When it finds words common to both // documents, it searches around those words in the normal-order hash-coded word lists to see // how long the matching phrases are. If they are long enough, it marks the matches in its lists. // The program must treat redundant words carefully--words that appear more than once in either or // both documents. For each copy of a redundant word in the left document, it looks through all // copies of that redundant word in the right document. In general, the right document's counter // changes first and recycles when redundant words are encountered. // If the program finds more than the threshold number of matching words in two documents, it // generates two html files. It embeds the document text in each file, with html codes to // underline the matching text. int main(int argc, char* argv[]) // main program { FILE *fdocp; // handle for document file (input) FILE *fhtmlp; // handle for html file (output) FILE *fcomp; // handle for comparisons report file (output) int docs,doccount; // total number of documents and running count int words,wordcount; // total number of words in document and running count int lcount; // general use counter, for local use only char dstring[255]; // character buffer for document name strings int ch; // current character for document input process int charcount; // character running count unsigned long inhash; // hash-coded version of a document word int docL,docR; // document number of left document and right document int wordcountL,wordcountR; // word running count for left document and right document int firstL,firstR; // first matching word in left document and right document int lastL,lastR; // last matching word in left document and right document int matchedwordsL,matchedwordsR; // total matched words in left document and right document int wordcountRsave; // saved copy of wordcountR, for redundant words int docbuffer[100]; // buffer for getting document characters int docbuffercount; // number of characters in document buffer int docmax=1000; // number of document entries allocated int docinc=1000; // step size when allocating more document entries int wordmax=1000; // number of word entries allocated int wordinc=1000; // step size when allocating more word entries document *pdoc,*pxdoc; // normal and temporary document entry pointers unsigned long *pwordhash,*pxwordhash; // normal and temporary hash-coded word list pointers int *matchL,*matchR; // left and right matched word list pointers int comparecount; // running count of documents int inputvalue; // temporary input value int phraselength=6; // length of shortest phrase to match int minstring=10; // length of shortest text string to accept int wordthreshold=500; // minimum count of matching words to report int firstnewdoc=1; // first document that hasn't been tested already printf("%s%s%s%s","Copyfind version 1.2, Copyright (C) 2002 Louis A. Bloomfield\n\n", "Copyfind comes with ABSOLUTELY NO WARRANTY. This is free software,\n", "and you are welcome to redistribute it under certain conditions.\n", "For details, see http://www.gnu.org/copyleft/gpl.html\n\n"); inputvalue=0; while( inputvalue<=0 ) { printf("%s","Enter minimum number of words in a phrase\n (6 is suggested choice): "); scanf("%d",&inputvalue); } phraselength=inputvalue; inputvalue=0; while( inputvalue<=0 ) { printf("%s","Enter minimum number of characters to consider as text\n (10 is suggested choice): "); scanf("%d",&inputvalue); } minstring=inputvalue; inputvalue=0; while( inputvalue<=0 ) { printf("%s","Enter minimum number of matching words to report\n (500 is a suggested choice): "); scanf("%d",&inputvalue); } wordthreshold=inputvalue; inputvalue=0; while( inputvalue<=0 ) { printf("%s","Enter number of first new document\n (usually 1, unless you have checked before): "); scanf("%d",&inputvalue); } firstnewdoc=inputvalue; printf("%s%d%s%d%s%d%s%d%s","\nPhrase length = ",phraselength, " words\nText string >= ", minstring," characters\nReport Threshold = ",wordthreshold," word\nFirst New Document = ", firstnewdoc,"\n\n"); printf("%s","Loading and Hashing Documents\n"); // first step: load documents and hash them if((fcomp=fopen("comparisons.txt", "w")) == NULL) // open comparison report file { printf("Cannot open comparisons.txt file.\n"); // if failed, report quitprogram(1); // and quit program } if((fdocp=fopen("doclist.txt", "r")) == NULL) // open list of documents { printf("Cannot open doclist.txt file.\n"); // if failed, report quitprogram(1); // and quit program } if( (pdoc = new document[docmax]) == NULL ) // allocate the maximum number of document entries { printf("Could not allocate enough memory for the document list.\n"); quitprogram(1); } doccount=0; // set count of documents to zero while(!feof(fdocp)) // loop until the list of documents is finished { if (doccount==docmax) // if the document entries are full { if( (pxdoc = // allocate a larger array of entries new document[docmax+docinc]) == NULL ) { printf("Could not allocate enough memory for the document list.\n"); quitprogram(1); } for(lcount=0;lcount0x20) && (ch<=0x7E) ) // if character is normal text, if( (ch>0x20) && (ch!=0x7F) && (ch!=0xFF) ) // if character is normal text, (including non-English characters) { inhash= // xor into the rotateleft(7) of inhash ((inhash << 7)|(inhash >> 23))^ch; charcount++; // and increment the count of characters in the word } else // if not a printing character { if( charcount>0 ) // if there is a word underway (nonzero length), { if (wordcount==wordmax) // if hash-coded word entries are full { if( (pxwordhash = new unsigned long[wordmax+wordinc]) == NULL ) { // allocate new, larger array of entries printf("Could not allocate enough memory for the word data array.\n"); quitprogram(1); } for(lcount=0;lcount3 letter word fclose(fdocp); // close the document } if( (matchL = new int[wordmax]) == NULL ) // allocate array for left match markers { printf("Could not allocate enough memory for the word matching arrays.\n"); quitprogram(1); } if( (matchR = new int[wordmax]) == NULL ) // allocate array for right match markers { printf("Could not allocate enough memory for the word matching arrays.\n"); quitprogram(1); } printf("%s","Comparing Documents\n"); // second, compare documents if(firstnewdoc<2) // if all documents are considered new, { firstnewdoc=2; // start comparisons with the second document } comparecount=0; for(docL=firstnewdoc-1;docL3 letter word wordcountR=pdoc[docR].firsthash; // start right at first >3 letter word wordcountRsave=wordcountR; // prepare right redundant word pointer matchedwordsL=0; // zero count of left matched words matchedwordsR=0; // zero count of right matched words while ( (wordcountL < pdoc[docL].words) // loop while there are still words to check && (wordcountR < pdoc[docR].words) ) { // if the next word in the left sorted hash-coded list has been matched if( matchL[pdoc[docL].pswordnum[wordcountL]] != 0 ) { wordcountL++; // advance to next left word continue; } // if the next word in the right sorted hash-coded list has been matched if( matchR[pdoc[docR].pswordnum[wordcountR]] != 0 ) { wordcountR++; // skip to next right sorted hash-coded word continue; } // check for left word less than right word if( pdoc[docL].pswordhash[wordcountL] < pdoc[docR].pswordhash[wordcountR] ) { wordcountL++; // advance to next left word if ( wordcountL >= pdoc[docL].words) break; if ( pdoc[docL].pswordhash[wordcountL] == pdoc[docL].pswordhash[wordcountL-1] ) { wordcountR=wordcountRsave; } else { wordcountRsave=wordcountR; } continue; // and resume looping } // check for right word less than left word if( pdoc[docL].pswordhash[wordcountL] > pdoc[docR].pswordhash[wordcountR] ) { wordcountR++; // advance to next right word wordcountRsave=wordcountR; // set pointer back to top of redundant words continue; // and resume looping } // we have a match, so look up and down the hash-coded (not sorted) lists for matches firstL=pdoc[docL].pswordnum[wordcountL]-1; // start left just before current word lastL=pdoc[docL].pswordnum[wordcountL]+1; // end left just after current word firstR=pdoc[docR].pswordnum[wordcountR]-1; // start right just before current word lastR=pdoc[docR].pswordnum[wordcountR]+1; // end right just after current word while( (firstL >= 0) && (firstR >= 0) ) // if both starting words match, { // make sure that left and right words haven't been used in a match before and // that the two words actually match. If so, move up another word and repeat the test. if( matchL[firstL] != 0 ) break; if( matchR[firstR] != 0 ) break; if( pdoc[docL].pwordhash[firstL] != pdoc[docR].pwordhash[firstR] ) break; firstL--; firstR--; } while( (lastL < pdoc[docL].words) && (lastR < pdoc[docR].words) ) // if both end words match { // make sure that the left and right words haven't been used in a match before and // that the two words actually match. If so, move down another word and repeat the test. if( matchL[lastL] != 0 ) break; if( matchR[lastR] != 0 ) break; if( pdoc[docL].pwordhash[lastL] != pdoc[docR].pwordhash[lastR] ) break; lastL++; lastR++; } firstL++; // correct for overshoot firstR++; // correct for overshoot lastL--; // correct for overshoot lastR--; // correct for overshoot if( lastL-firstL+1 >= phraselength ) // check that phrase is long enough to mark { for(lcount=firstL;lcount<=lastL;lcount++) // loop for all left matched words { matchL[lcount]=1; // mark those words as matched } for(lcount=firstR;lcount<=lastR;lcount++) // loop for all right matched words { matchR[lcount]=1; // mark those words as matched } matchedwordsL += lastL-firstL+1; // update count of left matched words matchedwordsR += lastR-firstR+1; // update count of right matched words } wordcountR++; // skip to next right sorted hash-coded word } // report number of matching words in the two documents fprintf(fcomp,"%d\t%d\t%s\t%s\n",matchedwordsL,matchedwordsR,pdoc[docL].docname,pdoc[docR].docname); comparecount++; // increment count of comparisons if( (comparecount%1000) == 0 ) // if count is divisible by 1000, { printf("%s%d%s","\r",comparecount," document comparisons completed"); // report count } if(matchedwordsL>=wordthreshold) // if there are enough matches to report, { strcpy(dstring,pdoc[docL].docname); // assemble html file name for left file strncat(dstring,".",80); strncat(dstring,pdoc[docR].docname,80); strncat(dstring,".html",80); if((fhtmlp=fopen(dstring,"w")) == NULL) // open html file { printf("Cannot open file ",dstring,"\n"); quitprogram(1); } // create header material for html file fprintf(fhtmlp,"%s%s%s%s%s%d%s","Markup of ",pdoc[docL].docname, " for Comparison with ",pdoc[docR].docname, " (Matched Words = ",matchedwordsL,")\n"); if((fdocp=fopen(pdoc[docL].docname, "rb")) == NULL) // open left document { printf("%s%d%s%s%s","Cannot open document #",docL," = ",pdoc[docL].docname,"\n"); quitprogram(1); } // generate text body of html file, with matching words underlined docprint(fdocp,fhtmlp,matchL,pdoc[docL].words,minstring); fclose(fdocp); // close document fprintf(fhtmlp,"%s","\n\n"); // complete html file fclose(fhtmlp); // close html file strcpy(dstring,pdoc[docR].docname); // assemble html file name for right file strncat(dstring,".",80); strncat(dstring,pdoc[docL].docname,80); strncat(dstring,".html",80); if((fhtmlp=fopen(dstring,"w")) == NULL) // open html file { printf("Cannot open file ",dstring,"\n"); // if failed, report quitprogram(1); // and quit program } // create header material for html file fprintf(fhtmlp,"%s%s%s%s%s%d%s","Markup of ",pdoc[docR].docname, " for Comparison with ",pdoc[docL].docname, " (Matched Words = ",matchedwordsL,")\n"); if((fdocp=fopen(pdoc[docR].docname, "rb")) == NULL) // open right document { printf("%s%d%s%s%s","Cannot open document #",docR," = ",pdoc[docR].docname,"\n"); quitprogram(1); // if failed, report and quit } // generate text body of html file, with matching words underlined docprint(fdocp,fhtmlp,matchR,pdoc[docR].words,minstring); fclose(fdocp); // close document fprintf(fhtmlp,"%s","\n\n"); // complete html file fclose(fhtmlp); // close html file } } } fclose(fcomp); // close comparisons file printf("%s","\nComparisons Done\n"); // report completion quitprogram(0); // and quit program return 0; } // Function: heapsort // Purpose: Sorts two tables together to put the first table in numerical order // Details: Sorts both tableA and tableB together, putting tableA in numerical order and taking tableB // with it. // Description: An implementation of the classic heapsort algorithm. The first half of the routine puts // the heap in order so that the value at the top of each two-branch node is greater than either of the two // values in the branches below it. The second half of the routine takes values off the top of the heap // and then resorts the heap so as to fill the opening and promote the remaining values. Ultimately, the // entire heap is put into ascending numerical order void heapsort(unsigned long *tableA,int *tableB,int n) { unsigned long tempA; // tempA can hold a hash-coded word temporarily int tempB; // tempB can hold a word number temporarily int nr; // number of remaining entries in the heap int index1,index2,indexc,indexstart; // pointers into the heap indexstart = (n >> 1); // start at the lowest two levels of the heap index1 = (n >> 1); // index1 points to second-to-lowest level of heap index2 = (index1 << 1); // index2 points to lowest level of heap for ( indexc=indexstart;indexc>=1;indexc--) // loop and work backwards to top of heap { index1 = indexc; // pointer to the current node index2 = (indexc << 1); // pointer to left branch below node while ( index2 <= n ) // while we haven't gone off bottom of heap { if( index2 < n ) // if there are two values branching below this node, { if( tableA[index2] // choose the largest value < tableA[index2+1] ) { index2++; // and point to it } } if( tableA[index1] < tableA[index2] ) // if the node is less than the largest value below, { tempA=tableA[index1]; // swap node and largest value below, part 1 tempB=tableB[index1]; // swap word number table entry to match, part 1 tableA[index1]=tableA[index2]; // swap part 2 tableB[index1]=tableB[index2]; // swap part 2 tableA[index2]=tempA; // swap part 3 tableB[index2]=tempB; // swap part 3 index1 = index2; // pointer to the next lower level of heap index2 = (index2 << 1); // pointer to the left branch below that node } else // else, this node is properly ordered, so move along { break; } } } for ( nr=n-1;nr>=1;nr-- ) // start at end of heap and move backward to start { tempA=tableA[1]; // swap top node and value beyond end of shortened heap, part1 tempB=tableB[1]; // swap word number table entry to match, part 1 tableA[1]=tableA[nr+1]; // swap part 2 tableB[1]=tableB[nr+1]; // swap part 2 tableA[nr+1]=tempA; // swap part 3 tableB[nr+1]=tempB; // swap part 3 index1 = 1; // pointer to the current node (top of heap) index2 = 2; // pointer to left branch below node while ( index2 <= nr ) // while we haven't gone off bottom of heap { if( index2 < nr ) // if there are two values branching below this node, { if( tableA[index2] // choose the largest value < tableA[index2+1] ) { index2++; // and point to it } } if( tableA[index1] < tableA[index2] ) // if the node is less than the largest value below, { tempA=tableA[index1]; // swap node and largest value below, part 1 tempB=tableB[index1]; // swap word number table entry to match, part 1 tableA[index1]=tableA[index2]; // swap part 2 tableB[index1]=tableB[index2]; // swap part 2 tableA[index2]=tempA; // swap part 3 tableB[index2]=tempB; // swap part 3 index1 = index2; // pointer to the next lower level of heap index2 = (index2 << 1); // pointer to the left branch below that node } else // else, this node is properly ordered, so move along { break; } } } return; // both tables are all sorted, so return } // Function: docgetchar // Purpose: Gets the next character from the document // Details: Skips non-printing characters and underlength text strings that lie between non-printing // characters. Converts smart-quotes to ordinary ones. Converts TAB characters to spaces and // carriage returns to linefeeds. // Description: Tries to maintain a buffer of text, LF, space, or EOF characters that is as long as the // minimum length specification. It throws out non-printing characters and any text // strings that aren;t minlength long. It works by filling the buffer whenever it's empty and // keeping it full when possible. But as soon as it encounters a non-printing character, it // lets the buffer evaporate to zero length. It then refills it, ignoring text strings that // that aren't long enough to fill it completely. int docgetchar(FILE *fdocp,int *docbuffer,int *docbuffercount,int minstring) { const int TAB=0x9; // The ASCII TAB character const int LF=0xA; // The ASCII LF character (linefeed) const int CR=0xD; // The ASCII CR character (carriage return) const int SPACE=0x20; // The ASCII SPACED character int ch,chout; // ch (current character), chout (output character) int lcount; // count (a counter) if( *docbuffercount==0 ) // if the buffer is empty { while(*docbuffercount < minstring) // loop until the buffer is full to the minimum string length { ch=fgetc(fdocp); // get the next character from the document if( ch==0 ) // possibly a double-byte character, { ch=fgetc(fdocp); // so get second byte and continue } if( ch<0 ) // handle an eof { docbuffer[*docbuffercount]=ch; // put the eof in the the buffer (*docbuffercount)++; // increment the number of characters in the buffer break; // don't look for any more characters in the document } if( (ch==0x93) || (ch==0x94) ) ch=0x22; // convert fancy double-quotes to regular double-quotes if( (ch==0x91) || (ch==0x92) ) ch=0x27; // convert fancy single-quotes to regular single-quotes // if( (ch>0x20) && (ch<=0x7E) ) // handle normal text characters if( (ch>0x20) && (ch!=0x7F) && (ch!=0xFF) ) // handle normal text characters, including non-English Characters { docbuffer[*docbuffercount]=ch; // store the normal text character in the buffer (*docbuffercount)++; // increment the number of characters in the buffer } else if( ch==LF ) // handle a linefeed character (end of paragraph) { docbuffer[*docbuffercount]=LF; // put the linefeed character in the buffer (*docbuffercount)++; // increment the number of characters in the buffer } else if( ch==SPACE ) // handle a space character { docbuffer[*docbuffercount]=SPACE; // put the space character in the buffer (*docbuffercount)++; // increment the number of characters in the buffer } else if( ch==TAB ) // handle a tab character { docbuffer[*docbuffercount]=SPACE; // put a space character in the buffer instead of a tab (*docbuffercount)++; // increment the number of characters in the buffer } else if( ch==CR ) // handle a carriage return character { docbuffer[*docbuffercount]=LF; // put a linefeed character in the buffer instead of a CR (*docbuffercount)++; // increment the number of characters in the buffer } else // handle characters that aren't text characters { (*docbuffercount)=0; // truncate the buffer (text string less than minimum) } } } if( (*docbuffercount == minstring) && (docbuffer[(*docbuffercount)-1] >= 0 )) // if the buffer is full and there isn't an eof at the end { chout=docbuffer[0]; // prepare to return the first character in the buffer for(lcount=0;lcount= 0) ) // loop until the buffer is full to the minimum string length { ch=fgetc(fdocp); // get another character from the document if( ch==0 ) // possibly a double-byte character, { ch=fgetc(fdocp); // so get second byte and continue } if( (ch==0x93) || (ch==0x94) ) ch=0x22; // convert fancy double-quotes to regular double-quotes if( (ch==0x91) || (ch==0x92) ) ch=0x27; // convert fancy single-quotes to regular single-quotes // if( (ch<0) || ((ch>0x20) && (ch<=0x7E)) || // handle eof, text characters, linefeed, or space if( (ch<0) || ((ch>0x20) && (ch!=0x7F) && (ch!=0xFF)) || // handle eof, text characters, linefeed, or space (ch==LF) || (ch==SPACE) ) { docbuffer[*docbuffercount]=ch; // put the character in the buffer (*docbuffercount)++; // increment the number of characters in the buffer } else if( ch==TAB ) // handle a tab character { docbuffer[*docbuffercount]=SPACE; // put a space character in the buffer instead of a tab (*docbuffercount)++; // increment the number of characters in the buffer } else if( ch==CR ) // handle a carriage return character { docbuffer[*docbuffercount]=LF; // put a linefeed character in the buffer instead of a CR (*docbuffercount)++; // increment the number of characters in the buffer } else { break; } } return chout; // return the character that was at the front of the buffer } else // if the buffer is only partly full { chout=docbuffer[0]; // prepare to return the first character in the buffer for(lcount=0;lcount"); // turn on underlining uline=1; // underlining is active } else if(uline==1 && (match[wordcount]!=1)) // have we encountered a nonmatch while underlining? { fprintf(fhtmlp,"%s",""); // turn off underlining uline=0; // underlining is not active } int charcount=0; // we have no characters in our current word while( true ) // loop until a break occurs { ch=docgetchar(fdocp,docbuffer, // get the next character from the document &docbuffercount,minstring); if( ch<0 ) // have we hit the eof? { break; // if so, break (we're done) } // if( (ch>0x20) && (ch<=0x7E) ) // is this an ordinary text character? if( (ch>0x20) && (ch!=0x7F) && (ch!=0xFF) ) // is this an ordinary text character (including non-English)? { outstring[0]=(char)ch; // if so, prepare to output that character outstring[1]='\0'; // terminate the string fprintf(fhtmlp,"%s",outstring); // print the character to the html file charcount++; // increase the number of characters in the current word } else if( ch==LF ) // have we hit a linefeed? { fprintf(fhtmlp,"%s","

\n

"); // if so, insert a paragraph boundary into the html file if( charcount>0 ) break; // if we had a word going, break to begin the next word } else if( charcount>0 ) // have we hit a nonprinting character while in a word? { fprintf(fhtmlp,"%s"," "); // if so, insert a blank into the html file break; // break to begin the next word } // if we aren't in a word and hit a nonprinting character, ignore it } } } void quitprogram(int value) { printf("%s","Press Enter to Quit Program\n"); fgetc(stdin); exit(value); }