Exclude a word if it is present in an array of words

Related searches

Considering this code that counts all occurrences, how do you remove common words?

For example, if the word is one of the top 100 English words, then don't count that word.

If you take the most common 100 words according to Wikipedia, how do you add those to an array and check to not count them on the list: https://en.wikipedia.org/wiki/Most_common_words_in_English

Top 100 most common words in an array form:

#define NUMBER_OF_STRING 100
#define MAX_STRING_SIZE   50

/*
 * The 100 most common English words.
 *
 * Every entry is lowercase (including "i") because the program lowercases
 * each word it reads before comparing it with strcmp(); an uppercase entry
 * could never match.
 */
char commonWords[NUMBER_OF_STRING][MAX_STRING_SIZE] = {
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
    "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
    "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
    "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
    "even", "new", "want", "because", "any", "these", "give", "day", "most", "us"
};

Code Example:

/**
 * C program to count occurrences of all words in a file.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>

#define MAX_WORD  20000     /* max word size */
#define MAX_WORDS     8     /* initial number of struct to allocate */

#ifndef PATH_MAX
#define PATH_MAX   2048     /* max path (defined for Linux in limits.h) */
#endif

/* One distinct word collected from the input file. */
typedef struct {
    char word[MAX_WORD];    /* the word itself, stored in lowercase */
    int cap;                /* non-zero once the word was seen capitalized */
    int count;              /* number of occurrences counted so far */
} words_t;

/**
 * Convert a nul-terminated string to lowercase in place.
 *
 * @param str  string to convert; must not be NULL.
 * @return     str, so the call can be used inside an expression.
 */
char *strlwr (char *str)
{
    char *p = str;

    while (*p) {
        /* <ctype.h> functions are only defined for values representable as
         * unsigned char (or EOF); plain char may be negative, so cast first */
        *p = (char) tolower ((unsigned char) *p);
        p++;
    }

    return str;
}

/**
 * Count the occurrences of every distinct word in a user-supplied file and
 * print the words that appeared capitalized at least once.
 *
 * The file path is read from stdin. Words are whitespace-delimited tokens
 * with trailing punctuation removed; they are compared in lowercase.
 *
 * @return 0 on success, 1 if no file path could be read. Exits with
 *         EXIT_FAILURE if the initial allocation or fopen fails.
 */
int main (void) {

    FILE *fptr;
    char path[PATH_MAX], word[MAX_WORD];
    char pathfmt[32], wordfmt[32];  /* "%<width>s" formats built below */
    size_t i, len, index = 0, max_words = MAX_WORDS;

    /* pointer to allocated block of max_words struct initialized zero */
    words_t *words = calloc (max_words, sizeof *words);
    if (!words) {   /* validate every allocation */
        perror ("calloc-words");
        exit (EXIT_FAILURE);
    }

    /* a bare "%s" can write past the end of its buffer, so build formats
     * that carry an explicit field width of (buffer size - 1) */
    snprintf (pathfmt, sizeof pathfmt, "%%%zus", sizeof path - 1);
    snprintf (wordfmt, sizeof wordfmt, "%%%zus", sizeof word - 1);

    /* Input file path */
    printf ("Enter file path: ");
    if (scanf (pathfmt, path) != 1) {   /* validate every input */
        fputs ("error: invalid file path or cancellation.\n", stderr);
        free (words);
        return 1;
    }

    fptr = fopen (path, "r");   /* open file */
    if (fptr == NULL) {         /* validate file open */
        fputs ( "Unable to open file.\n"
                "Please check you have read privileges.\n", stderr);
        free (words);
        exit (EXIT_FAILURE);
    }

    while (fscanf (fptr, wordfmt, word) == 1) {  /* while valid word read */
        int iscap = 0, isunique = 1;    /* is capital, is unique flags */

        /* ctype functions need an unsigned char value, not a plain char */
        if (isupper ((unsigned char) *word))    /* is the word capitalized */
            iscap = 1;

        /* remove all trailing punctuation characters */
        len = strlen (word);
        while (len && ispunct ((unsigned char) word[len - 1]))
            word[--len] = 0;

        if (len == 0)       /* token was punctuation only - not a word */
            continue;

        strlwr (word);                  /* convert word to lowercase */

        /* check if word exists in list of all distinct words */
        for (i = 0; i < index; i++) {
            if (strcmp (words[i].word, word) == 0) {
                isunique = 0;               /* set unique flag zero */
                if (iscap)                  /* if capital flag set */
                    words[i].cap = iscap;   /* set capital flag in struct */
                words[i].count++;           /* increment word count */
                break;                      /* bail - done */
            }
        }
        if (isunique) { /* if unique, add to array, increment index */
            if (index == max_words) {       /* is realloc needed? */
                /* always use a temporary pointer with realloc */
                void *tmp = realloc (words, 2 * max_words * sizeof *words);
                if (!tmp) { /* validate every allocation */
                    perror ("realloc-words");
                    break;  /* don't exit, original data still valid */
                }
                words = tmp;    /* assign reallocated block to words */
                /* zero the new half so cap and count start at 0 */
                memset (words + max_words, 0, max_words * sizeof *words);
                max_words *= 2; /* update max_words to reflect new limit */
            }
            memcpy (words[index].word, word, len + 1);  /* have len */
            if (iscap)                      /* if cap flag set */
                words[index].cap = iscap;   /* set capital flag in struct */
            words[index++].count++;         /* increment count & index */
        }
    }
    fclose (fptr);  /* close file */

    /*
     * Print occurrences of all words that were seen capitalized.
     */
    puts ("\nOccurrences of all distinct words with Cap in file:");
    for (i = 0; i < index; i++) {
        if (words[i].cap) {
            strcpy (word, words[i].word);
            *word = (char) toupper ((unsigned char) *word);
            /*
             * %-8d prints the count left-aligned inside an
             * 8 character wide field, followed by the word.
             */
            printf ("%-8d %s\n", words[i].count, word);
        }
    }
    free (words);

    return 0;
}

Text File to Test: (cars.txt)

A car (or automobile) is a wheeled motor vehicle used for transportation. Most definitions of car say they run primarily on roads, seat one to eight people, have four tires, and mainly transport people rather than goods.[2][3]

Cars came into global use during the 20th century, and developed economies depend on them. The year 1886 is regarded as the birth year of the modern car when German inventor Karl Benz patented his Benz Patent-Motorwagen. Cars became widely available in the early 20th century. One of the first cars accessible to the masses was the 1908 Model T, an American car manufactured by the Ford Motor Company. Cars were rapidly adopted in the US, where they replaced animal-drawn carriages and carts, but took much longer to be accepted in Western Europe and other parts of the world.

Cars have controls for driving, parking, passenger comfort, and a variety of lights. Over the decades, additional features and controls have been added to vehicles, making them progressively more complex. These include rear reversing cameras, air conditioning, navigation systems, and in-car entertainment. Most cars in use in the 2010s are propelled by an internal combustion engine, fueled by the combustion of fossil fuels. Electric cars, which were invented early in the history of the car, began to become commercially available in 2008.

There are costs and benefits to car use. The costs include acquiring the vehicle, interest payments (if the car is financed), repairs and maintenance, fuel, depreciation, driving time, parking fees, taxes, and insurance.[4] The costs to society include maintaining roads, land use, road congestion, air pollution, public health, health care, and disposing of the vehicle at the end of its life. Road traffic accidents are the largest cause of injury-related deaths worldwide.[5]

The benefits include on-demand transportation, mobility, independence, and convenience.[6] The societal benefits include economic benefits, such as job and wealth creation from the automotive industry, transportation provision, societal well-being from leisure and travel opportunities, and revenue generation from the taxes. People's ability to move flexibly from place to place has far-reaching implications for the nature of societies.[7] There are around 1 billion cars in use worldwide. The numbers are increasing rapidly, especially in China, India and other newly industrialized countries.[8]

Current output:

Occurrences of all distinct words with Cap in file:
3        A
2        Motor
2        Most
2        One
8        Cars
29       The
1        German
1        Karl
2        Benz
1        Patent-motorwagen
1        Model
1        T
1        American
1        Ford
1        Company
1        Us
1        Western
1        Europe
1        Over
1        These
1        Electric
2        There
2        Road
1        People's
1        China
1        India

Expected Output: (Example only)

2        Motor
1        German
1        Karl
2        Benz
1        Patent-motorwagen
1        Model
1        T
1        American
1        Ford
1        Company

EDIT Update: Possible Solutions:

  • for and continue (doesn't work)

    // skip the word if it is a common word
    for (int i = 0; i < NUMBER_OF_STRING; i++) {
        if (strcmp(word, commonWords[i])==0) {
            continue;
        }
    }
    

    A slightly more efficient way would be to use a single call to strstr rather than attempting to compare against every one of the top 100 most common words. Since you know the 100 most common words, and they will not change, you can easily determine that the longest of them is 7 characters. In other words, you only need to test whether word is one of the most common if its length is less than:

    #define TOP_LEN       8     /* longest string in TOP100 + nul-character */
    

    Since the words do not change, you can go ahead and:

    /*
     * The 100 most common English words, all lowercase, in one string.
     * Every word has a single space on each side (including the first and
     * the last), so " word " can be located with one strstr() call.
     */
    const char TOP100[] = " the be to of and a in that have i it for not on with"
                    " he as you do at this but his by from they we say her she or"
                    " an will my one all would there their what so up out if about"
                    " who get which go me when make can like time no just him know"
                    " take people into year your good some could them see other"
                    " than then now look only come its over think also back after"
                    " use two how our work first well way even new want because"
                    " any these give day most us ";
    

    (note: the space before and the space after each word which allows you to create a teststr to search for with strstr by including a space on either side of your word. 'I' has been converted to lowercase to work after your strlwr (word);)

    (also note: you could also use a constant literal with #define TOP100 " the ... us ", but it would wrap and scroll horribly off the page here -- up to you)

    With your constant string of the 100 most common words, the only addition needed is:

            ...
            strlwr (word);                  /* convert word to lowercase */
    
            /* check against 100 most common words (TOP100) */
            if (len < TOP_LEN) {                    /* word less than TOP_LEN? */
                char teststr[TOP_LEN * 2];          /* buffer for " word " */
                sprintf (teststr, " %s ", word);    /* create teststr */
                if (strstr (TOP100, teststr))       /* check if in TOP100 */
                    continue;                       /* if so, get next word */
            }
            ...
    

    As you can see above, you check whether the word is 7 characters or fewer (otherwise there is no need to check against the most common words). You then declare a teststr to hold your string with a space at each end. (Since the longest common word is 7 characters, 7 characters plus 2 spaces is 9 characters, plus the nul-character is 10, so 16 characters is more than adequate here.)

    A simple call to sprintf is all that is needed to put the spaces at each end of word, and then a single call to strstr is all that is needed to see if word is within the top 100 most common words. If it is, no need to go further, just continue and get the next word.

    Putting it all together in your code, you would have:

    /**
     * C program to count occurrences of all words in a file.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>
    #include <limits.h>
    
    #define MAX_WORD  20000     /* max word size */
    #define MAX_WORDS     8     /* initial number of struct to allocate */
    #define TOP_LEN       8     /* longest string in TOP100 + nul-character */
    
    #ifndef PATH_MAX
    #define PATH_MAX   2048     /* max path (defined for Linux in limits.h) */
    #endif
    
    /*
     * The 100 most common English words, all lowercase, in one string.
     * Every word has a single space on each side (including the first and
     * the last), so " word " can be located with one strstr() call.
     */
    const char TOP100[] = " the be to of and a in that have i it for not on with"
                    " he as you do at this but his by from they we say her she or"
                    " an will my one all would there their what so up out if about"
                    " who get which go me when make can like time no just him know"
                    " take people into year your good some could them see other"
                    " than then now look only come its over think also back after"
                    " use two how our work first well way even new want because"
                    " any these give day most us ";
    
    /* One distinct word collected from the input file. */
    typedef struct {
        char word[MAX_WORD];    /* the word itself, stored in lowercase */
        int cap;                /* non-zero once the word was seen capitalized */
        int count;              /* number of occurrences counted so far */
    } words_t;
    
    /**
     * Convert a nul-terminated string to lowercase in place.
     *
     * @param str  string to convert; must not be NULL.
     * @return     str, so the call can be used inside an expression.
     */
    char *strlwr (char *str)
    {
        char *p = str;

        while (*p) {
            /* <ctype.h> functions are only defined for values representable
             * as unsigned char (or EOF); plain char may be negative */
            *p = (char) tolower ((unsigned char) *p);
            p++;
        }

        return str;
    }
    
    /**
     * Count the occurrences of every distinct word in a user-supplied file,
     * ignoring the words listed in TOP100, and print the words that appeared
     * capitalized at least once.
     *
     * The file path is read from stdin. Words are whitespace-delimited tokens
     * with trailing punctuation removed; they are compared in lowercase.
     *
     * @return 0 on success, 1 if no file path could be read. Exits with
     *         EXIT_FAILURE if the initial allocation or fopen fails.
     */
    int main (void) {

        FILE *fptr;
        char path[PATH_MAX], word[MAX_WORD];
        char pathfmt[32], wordfmt[32];  /* "%<width>s" formats built below */
        size_t i, len, index = 0, max_words = MAX_WORDS;

        /* pointer to allocated block of max_words struct initialized zero */
        words_t *words = calloc (max_words, sizeof *words);
        if (!words) {   /* validate every allocation */
            perror ("calloc-words");
            exit (EXIT_FAILURE);
        }

        /* a bare "%s" can write past the end of its buffer, so build formats
         * that carry an explicit field width of (buffer size - 1) */
        snprintf (pathfmt, sizeof pathfmt, "%%%zus", sizeof path - 1);
        snprintf (wordfmt, sizeof wordfmt, "%%%zus", sizeof word - 1);

        /* Input file path */
        printf ("Enter file path: ");
        if (scanf (pathfmt, path) != 1) {   /* validate every input */
            fputs ("error: invalid file path or cancellation.\n", stderr);
            free (words);
            return 1;
        }

        fptr = fopen (path, "r");   /* open file */
        if (fptr == NULL) {         /* validate file open */
            fputs ( "Unable to open file.\n"
                    "Please check you have read privileges.\n", stderr);
            free (words);
            exit (EXIT_FAILURE);
        }

        while (fscanf (fptr, wordfmt, word) == 1) {  /* while valid word read */
            int iscap = 0, isunique = 1;    /* is capital, is unique flags */

            /* ctype functions need an unsigned char value, not a plain char */
            if (isupper ((unsigned char) *word))    /* is the word capitalized */
                iscap = 1;

            /* remove all trailing punctuation characters */
            len = strlen (word);
            while (len && ispunct ((unsigned char) word[len - 1]))
                word[--len] = 0;

            if (len == 0)       /* token was punctuation only - not a word */
                continue;

            strlwr (word);                  /* convert word to lowercase */

            /* check against 100 most common words (TOP100) */
            if (len < TOP_LEN) {                    /* word less than TOP_LEN? */
                char teststr[TOP_LEN * 2];          /* buffer for " word " */
                /* surround the word with spaces so only whole words match */
                snprintf (teststr, sizeof teststr, " %s ", word);
                if (strstr (TOP100, teststr))       /* check if in TOP100 */
                    continue;                       /* if so, get next word */
            }

            /* check if word exists in list of all distinct words */
            for (i = 0; i < index; i++) {
                if (strcmp (words[i].word, word) == 0) {
                    isunique = 0;               /* set unique flag zero */
                    if (iscap)                  /* if capital flag set */
                        words[i].cap = iscap;   /* set capital flag in struct */
                    words[i].count++;           /* increment word count */
                    break;                      /* bail - done */
                }
            }
            if (isunique) { /* if unique, add to array, increment index */
                if (index == max_words) {       /* is realloc needed? */
                    /* always use a temporary pointer with realloc */
                    void *tmp = realloc (words, 2 * max_words * sizeof *words);
                    if (!tmp) { /* validate every allocation */
                        perror ("realloc-words");
                        break;  /* don't exit, original data still valid */
                    }
                    words = tmp;    /* assign reallocated block to words */
                    /* zero the new half so cap and count start at 0 */
                    memset (words + max_words, 0, max_words * sizeof *words);
                    max_words *= 2; /* update max_words to reflect new limit */
                }
                memcpy (words[index].word, word, len + 1);  /* have len */
                if (iscap)                      /* if cap flag set */
                    words[index].cap = iscap;   /* set capital flag in struct */
                words[index++].count++;         /* increment count & index */
            }
        }
        fclose (fptr);  /* close file */

        /*
         * Print occurrences of all words that were seen capitalized.
         */
        puts ("\nOccurrences of all distinct words with Cap in file:");
        for (i = 0; i < index; i++) {
            if (words[i].cap) {
                strcpy (word, words[i].word);
                *word = (char) toupper ((unsigned char) *word);
                /*
                 * %-8d prints the count left-aligned inside an
                 * 8 character wide field, followed by the word.
                 */
                printf ("%-8d %s\n", words[i].count, word);
            }
        }
        free (words);

        return 0;
    }
    

    Example Use/Output

    As was the case last time, your Expected Output: (Example only) is wrong because there is nothing in your code to remove plurals, possessives or plural possessives, so your output with your cars.txt file would be:

    $ ./bin/unique_words_exclude_top_100
    Enter file path: dat/cars.txt
    
    Occurrences of all distinct words with Cap in file:
    2        Motor
    8        Cars
    1        German
    1        Karl
    2        Benz
    1        Patent-motorwagen
    1        Model
    1        T
    1        American
    1        Ford
    1        Company
    1        Western
    1        Europe
    1        Electric
    2        Road
    1        People's
    1        China
    1        India
    

    Look things over and let me know if you have further questions.

    Match regular expression (case sensitive), If there are no matches, startIndex is an empty array. Values in startIndex indicate the index of the first character of each word that matches the regular expression. Capture words within a character vector that contain the letter x . Exclude newline characters from the match using the 'dotexceptnewline' option. Given a big string and an array of small strings, all of which are smaller in length than the big string. The task is to create an array of booleans, where each boolean represents whether the small string at that index in the array of small strings is contained in the big string.

    Filter out common words before adding a word to the words list. I wrote the filter function below:

    /**
     * Test whether a word is one of the entries in commonWords.
     *
     * @param word  nul-terminated word to look up; it is only read, and is
     *              compared exactly (case-sensitive) against each entry.
     * @return      1 if word equals an entry of commonWords, 0 otherwise.
     */
    int isCommonWord(const char *word)
    {
        int i = 0;
        for (i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(commonWords[i], word) == 0) return 1;
        }
        return 0;
    }
    

    Then filter out the word before adding it to the words array. Please refer to the 2nd line of the code below, which is the line I modified:

    if (isunique) { /* if unique, add to array, increment index */
        if (!isCommonWord(word)) {
            if (index == max_words) {       /* is realloc needed? */
                /* always use a temporary pointer with realloc */
                void *tmp = realloc(words, 2 * max_words * sizeof *words);
                if (!tmp) { /* validate every allocation */
                    perror("realloc-words");
                    break;  /* don't exit, original data still valid */
                }
                words = (words_t *)tmp;    /* assign reallocated block to words */
                /* (optional) set all new memory to zero */
                memset(words + max_words, 0, max_words * sizeof *words);
                max_words *= 2; /* update max_words to reflect new limit */
            }
            memcpy(words[index].word, word, len + 1);  /* have len */
            if (iscap)                      /* if cap flag set */
                words[index].cap = iscap;   /* set capital flag in struct */
            words[index++].count++;         /* increment count & index */
        }
    }
    

    I think the result is correct, as shown below:

    Enter file path: cars.txt
    
    Occurrences of all distinct words with Cap in file:
    2        Motor
    8        Cars
    1        German
    1        Karl
    2        Benz
    1        Patent-motorwagen
    1        Model
    1        T
    1        American
    1        Ford
    1        Company
    1        Western
    1        Europe
    1        Electric
    2        Road
    1        People's
    1        China
    1        India
    

    Most frequent word in an array of strings, Iterate through array of words. for ( int i = 0 ; i < arr.length; i++) {. // If word already exist in HashMap then increase it's count by 1. if (hs. I’ve highlighted the matching words in column A red. What we want Excel to do is to check the text string in column A to see if any of the words in our list in H1:H3 are present, if they are then return the matching word. Note: I’ve given cells H1:H3 the named range ‘list’.

    This obviously doesn't work: despite the misleading comment, it doesn't skip the word when it is a common word. The continue only skips the current iteration of the for loop and goes on to compare against the next entry in the common words list.

    // skip the word if it is a common word
    for (int i = 0; i < NUMBER_OF_STRING; i++) {
        if (strcmp(word, commonWords[i])==0) {
            continue;
        }
    }
    

    continue will only affect the innermost loop. Besides, after the loop nothing is changed

    To fix that, you need to continue the outer loop:

    /* Jumping to the label re-evaluates the while condition, which reads the
     * next word - the same effect as a continue of the outer loop. */
    nextword:
    while (fscanf (fptr, "%s", word) == 1) { // read the word
        for (int i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(word, commonWords[i])==0) {
                goto nextword; // skip current word
            }
        }
        /// ...
    }
    

    Or if you don't want to use goto then another variable must be used

    while (fscanf (fptr, "%s", word) == 1) { // read the word
        // the flag must start at 0 for every word; otherwise, once a common
        // word has been seen, every following word would be skipped too
        int isCommonWord = 0;
        for (int i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(word, commonWords[i])==0) {
                isCommonWord = 1;
                break; // exit the for loop
            }
        }
        if (isCommonWord)
            continue;  // get the next word
        /// ...
    }
    

    Anyway your implementation is quite inefficient. This is basically a dictionary that maps from a string (the word) to integer (which is the word count). The dictionary can be sorted (like std::map in C++) or hash-based (std::unordered_map in C++). Since you don't sort the array you always have to traverse through the whole list. If the array is sorted then using binary search will cut down the lookup significantly. To check a list of 128 elements you need only at most 7 comparisons instead of 128 like in the case of unsorted list

    But before looking for the word in the dictionary you need to check if the word is common or not first. That's done by checking if the word exists in the common word set or not. Again the set can be implemented unsorted (slow), sorted (better, std::set in C++) or hash-based (fastest but needs more memory, std::unordered_set in C++). The difference between the set and the dictionary is that each dictionary entry contains a pair of (key, value), whereas the value is also the key in a set. The for loop checking strcmp(word, commonWords[i])==0 above is a simple set traversing. In any case, once you've found the word in the set, skip the current while loop and not the for loop like I said above. That'll work

    str_replace - Manual, If search is an array and replace is a string, then this replacement string is or anything else, but this is a bit of code that allows you to replace strings found in an It's interesting that these developers use str_replace (let's ignore the fact that… Given two sentences as strings A and B. The task is to return a list of all uncommon words. A word is uncommon if it appears exactly once in any one of the sentences, and does not appear in the other sentence. Note: A sentence is a string of space-separated words. Each word consists only of

    Continuing outer loops is a case where goto is recommended.

    Add a label before the while:

    outer:
    while (fscanf (fptr, "%s", word) == 1)  { ... }
    

    And change the possible solution in the question to:

    for (int i = 0; i < NUMBER_OF_STRING; i++) {
        if (strcmp(word, commonWords[i])==0) {
            goto outer;
        }
    }
    

    In your current solution, continue simply continues the inner for loop.

    EDIT


    Based on your program, modifying the program as follows should work:

    .
    .
    .
    
    outer:
    while (fscanf (fptr, "%s", word) == 1) {
        .
        .
        .
    
        strlwr(word);
    
        for (int i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(word, commonWords[i])==0) {
                goto outer;
            }
        }
    
        .
        .
        .
    }
    
    .
    .
    .
    

    A function for this would look like:

    /**
     * Test whether a word is one of the entries in commonWords.
     *
     * @param word  nul-terminated word to look up; it is only read, and is
     *              compared exactly (case-sensitive) against each entry.
     * @return      1 if word equals an entry of commonWords, 0 otherwise.
     */
    int isCommon(const char *word) {
        for (int i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(word, commonWords[i])==0) {
                return 1;
            }
        }
        return 0;
    }
    
    int main() {
        .
        .
        .
    
        while (fscanf (fptr, "%s", word) == 1) {
            .
            .
            .
    
            strlwr(word);
    
            if(isCommon(word))
                continue;
    
            .
            .
            .
        }
    
        .
        .
        .
    }
    

    Note that if using this function, you no longer need goto; a simple continue would suffice.

    preg_split - Manual, Tip. If matching fails, an array with a single element containing the input string will be returned. If you want to split by a char, but want to ignore that char in case it is escaped, use a This regular expression will split a long string of words into an array of the number of 'non-word' characters found after the last word Count words present in a string; Count words in a given string; Count of words whose i-th letter is either (i-1)-th, i-th, or (i+1)-th letter of given word; Program to find Smallest and Largest Word in a String; Count substrings with same first and last characters; Recursive solution to count substrings with same first and last characters

    Excel formula: Cell contains some words but not others, To test a cell to see if contains certain words but not others, you can use an array When no match is found, SEARCH returns the #VALUE error. For example, to test for red, blue, or green, but exclude pink and orange, you can use: The Excel SEARCH function returns the location of one text string inside another. Consider above example in which 'an' is the smallest word and 'extraordinary' is the largest word. One of the approach to find smallest and largest word is to split string into words then, compare length of each word with variables small and large. If length of a word is less than length of small then, store that word in small.

    Method #1 : Using split () Using split function, we can split the string into a list of words and is most generic and recommended method if one wished to accomplish this particular task. But drawback is that it fails in the cases in string contains punctuation marks.

    The “\\w” pattern matches a word in a regular expression. For example, “Java Example.Hello” string will return word count as 2 using the “\\s+” pattern because “Example” and “Hello” are not separated by a space character but the dot. While the “\\w+” pattern will return word count as 3 since it matches words as given below.

    Comments
    • Make an array with the top 100 words, then when you read the file word by word (as you already do) simply check if it's inside the array. in that case, continue the while.
    • You shouldn't stuff the whole functionality into main. Use functions.
    • ... and where is your attempt? There is absolutely no code that filters out the most common words. You need to show what you have tried. Stackoverflow is not a place where you can ask people to write code for you. You should know that with 665 rep.
    • You don't compare strings with == You're just comparing pointers, that way
    • I think strcmp(word, commonWord[i]) will work better.
    • Thank you for your answer, however, the solution does not fix the problem of skipping common words from being added to the list of words as per the running program.
    • Bonus: do you know how to add that to a function?
    • @jaycodez goto? see the code from my answer. It is the function.
    • continue is also not good way because programmer must always consider codes from the below of continue to the end of loop. It is not good for a maintainability even though this code works now. So, I suggest if statement.

    Hot Questions

  • Chain (compose) method calls in Groovy2592
  • Display single item from angularfire2 query6917
  • SQL Server: Examples of PIVOTing String data9930
  • Forward ref through React Router's withRouter HOC589
  • Pip error when trying to run pip command from virtualenv on macOS6037
  • select2 keyboard issue on mobile6218
  • Express.js Response Timeout6219
  • Pandas Dataframe display on a webpage4355
  • R Markdown Math Equation Alignment8465
  • Regress a matrix against a column vector in R6977
  • Git pull is very slow... Why?8999
  • How to match two element horizontally in Constraint Layout?9150
  • Python appending list, nothings added848
  • select days date in between from_date and to_date table columns6272
  • Can't `brew link` an unlinked keg8715
  • What is username and password when starting Spring Boot with Tomcat?6472
  • How can I safely change states between state objects?3750
  • Access template parameter from class object5993
  • C++ setter/getter in one function?4725
  • What is the Best way to manage state in AngularJS6075