/* file: ibs.c Albert Yang, MD 25 August 2004 0.9 . Last revised: 26 October 2004 (GBM) 1.0 _______________________________________________________________________________ ibs: calculates information-based similarity index (IBS) between two data sets This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. You may contact the author by e-mail (ccyang@physionet.org/). For updates to this software, please visit PhysioNet (http://www.physionet.org/). _______________________________________________________________________________ Revision history: 0.9 (25-Aug-2004, Albert Yang) Original version 1.0 (26-Oct-2004, George Moody) Removed limits on input series length, fixed bugs that caused crashes for large values of M, memory leaks, loss of precision in index calculations TODO: collect word frequency histograms while reading the input series to reduce memory requirements; check for and avoid overflow in histograms Compile this program by gcc -o ibs -O ibs.c -lm This program reads two text files of numbers, which are interpreted as values of two time series. Within each series, pairs of consecutive values are compared to derive a binary series, which has values that are either 1 (if the second value of the pair was greater than the first) or 0 (otherwise). A user-specified parameter, M, determines the length of "words" (M-tuples) to be analyzed by this progam. Within each binary series (bs1 and bs2), all M-tuples of consecutive values are treated as "words"; the function counts the occurrences of each of the 2**M possible "words" and then derives the word rank order frequency (WROF) list for the series. Finally, it calculates the information-based similarity between the two WROF lists, and outputs this number. Depending on the input series and on the choice of M, the value of the index can vary between 0 (completely dissimilar) and 1 (identical). */ #include #include #include /* Sort a word frequency histogram in order to derive a WROF list */ void quicksort(int *array[2], int left, int right, int ic) { if (left < right) { int i = left-1, j = right+1, pivot = array[ic][left], temp; while (i < j) { while (array[ic][--j] > pivot) ; while (array[ic][++i] < pivot) ; if (i < j) { /* swap ith and jth elements of array */ temp = array[0][i]; array[0][i] = array[0][j]; array[0][j] = temp; temp = array[1][i]; array[1][i] = array[1][j]; array[1][j] = temp; } } quicksort(array, left, j, ic); quicksort(array, j+1, right, ic); } } char *pname; /* the name of this program (main's argv[0]) */ void help() { fprintf(stderr, "usage: %s M SERIES1 SERIES2\n", pname); fprintf(stderr, " where M is the word length (an integer greater than 1), and\n" " SERIES1 and SERIES2 are one-column text files containing the\n" " data of the two series that are to be compared. The output\n" " is the information-based similarity index of the input series\n" " evaluated for M-tuples (words of length M).\n\n" " For additional information, see http://physionet.org/physiotools/ibs/.\n"); } char *bs1, *bs2; /* binary series */ FILE *RF1, *RF2; /* input file pointers */ int *temp, *w1[2], *w2[2]; /* workspace for histograms and WROF lists */ /* This function releases all dynamically allocated memory, closes the input files, and exits this program (it does not return to the caller). */ void cleanup(int exitcode) { if (RF1) fclose(RF1); if (RF2) fclose(RF2); if (bs1) free(bs1); if (bs2) free(bs2); if (temp) free(temp); if (w1[0]) free(w1[0]); if (w1[1]) free(w1[1]); if (w2[0]) free(w2[0]); if (w2[1]) free(w2[1]); exit(exitcode); } int main(int argc, char **argv) { double dr, f1, f2, p, pi, pj, x1, x2; int i, j, m, mtuple, ns1, ns2, nw; long maxdat = 0L, nlin1 = 0L, nlin2 = 0L; /* Read the argument list. */ pname = argv[0]; if ((argc < 4) || (m = atoi(argv[1])) < 2) { help(); exit(1); } if (m >= (sizeof(int)*8)) { fprintf(stderr, "%s: insufficient memory (try a smaller M)\n", pname); exit(1); } /* Allocate and initialize workspace for word histograms and WROF lists. */ nw = 1 << m; if ((temp = calloc(nw, sizeof(int))) == NULL || (w1[0] = calloc(nw, sizeof(int))) == NULL || (w1[1] = calloc(nw, sizeof(int))) == NULL || (w2[0] = calloc(nw, sizeof(int))) == NULL || (w2[1] = calloc(nw, sizeof(int))) == NULL) { fprintf(stderr, "%s: insufficient memory (try a smaller M)\n", pname); cleanup(1); } for (i = 0; i < nw; i++) w1[0][i] = w2[0][i] = i; /* Open the input files. */ if ((RF1 = fopen(argv[2], "r")) == NULL) { fprintf(stderr, "%s: can't open %s\n", pname, argv[2]); cleanup(1); } if ((RF2 = fopen(argv[3], "r")) == NULL) { fprintf(stderr, "%s: can't open %s\n", pname, argv[3]); cleanup(1); } /* Read input series 1. */ fscanf(RF1, "%f", &x1); while (fscanf(RF1, "%f", &x2) == 1) { if (nlin1 >= maxdat) { char *pbs; maxdat += 50000; /* allow bs1 to grow (the increment is arbitrary) */ if ((pbs = realloc(bs1, maxdat * sizeof(char))) == NULL) { fprintf(stderr, "%s: insufficient memory for %s\n", pname, argv[2]); cleanup(1); } bs1 = pbs; } bs1[nlin1++] = (x2 > x1) ? 1 : 0; x1 = x2; } if (nlin1 < m) { fprintf(stderr, "%s: input series %s is too short (< %d values)\n", pname, argv[2], m); cleanup(1); } /* Read input series 2. */ maxdat = 0L; fscanf(RF2, "%f", &x1); while (fscanf(RF2, "%f", &x2) == 1) { if (nlin2 >= maxdat) { char *pbs; maxdat += 50000; /* allow bs2 to grow (the increment is arbitrary) */ if ((pbs = realloc(bs2, maxdat * sizeof(char))) == NULL) { fprintf(stderr, "%s: insufficient memory for %s\n", pname, argv[3]); cleanup(1); } bs2 = pbs; } bs2[nlin2++] = (x2 > x1) ? 1 : 0; x1 = x2; } if (nlin2 < m) { fprintf(stderr, "%s: input series %s is too short (< %d values)\n", pname, argv[3], m); cleanup(1); } /* Accumulate word frequency histograms. */ for (i = 0, f1 = ns1 = nlin1 - m + 1; i < ns1; i++) { mtuple = 0; for (j = 1 ; j <= m ; j++){ mtuple = mtuple + (1 << (m-j))*bs1[i+j-1]; } w1[1][mtuple]++; } for (i = 0, f2 = ns2 = nlin2 - m + 1; i < ns2; i++) { mtuple = 0; for (j = 1 ; j <= m ; j++){ mtuple = mtuple + (1 << (m-j))*bs2[i+j-1]; } w2[1][mtuple]++; } /* Sort histograms to obtain WROF lists. At this point, w1[0] contains word values, and w1[1] contains word counts (e.g., w1[0][5] = 5, and w1[1][5] contains the number of times that 5 occurs in bs1). */ for (i = 0; i < nw; i++) temp[i] = w1[1][i]; /* save w1[1] (word counts) */ quicksort(w1, 0, nw - 1, 1); /* sort bins by word counts */ for (i = 0; i < nw; i++) w1[1][i] = nw - i - 1; /* store word rank in w1[1] */ quicksort(w1, 0, nw - 1, 0); /* restore order by word value */ for (i = 0; i < nw; i++) w1[0][i] = temp[i]; /* replace word values with ranks */ /* Repeat for w2. */ for (i = 0; i < nw; i++) temp[i] = w2[1][i]; quicksort(w2, 0, nw - 1, 1); for (i = 0; i < nw; i++) w2[1][i] = nw - i - 1; quicksort(w2, 0, nw - 1, 0); for (i = 0 ; i < nw; i++) w2[0][i] = temp[i]; /* Calculate similarity index. */ for (dr = p = i = 0; i < nw; i++) { pi = w1[0][i]/f1; /* pi = probability of finding i by choosing a random word from bs1 */ pj = w2[0][i]/f2; /* pj = probability of finding i by choosing a random word from bs2 */ if (pi > 0) pi = -pi * log(pi); /* Shannon entropy of word i in bs1 */ else pi = 0; if (pj > 0) pj = -pj * log(pj); else pj = 0; dr += abs(w1[1][i] - w2[1][i]) * (pi + pj); p += pi + pj; } if (p > 0) dr /= (p * (nw-1)); else dr = -1; /* This should never happen! (It might occur if if we had not already checked to see that at least m values were available in each binary series.) */ printf("%lf\n", dr); cleanup(0); }