How to create a wordcloud from a TDM in python?


#1

I am trying to create word clouds from the output of a CountVectorizer, which is a term-document matrix, but I can’t find any package that does this. Any help would be appreciated.


How can I create word cloud in Python?
#2

Install the wordcloud package using “pip install wordcloud”. Then you can use it as shown below.

# import library
import re

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import STOPWORDS, WordCloud

# Load spreadsheet
# NOTE: `file` must already hold the path to the Excel workbook.
xl = pd.ExcelFile(file)
# Load a sheet into a DataFrame by name: df1
df1 = xl.parse('Sheet1')

# Keep the text of the rows labelled 'Positive'. The text itself (not its
# length) is needed, because it is later joined into a single string.
data_pos = df1.loc[df1["class"] == 'Positive', 'text']
# Drop missing cells and coerce to str so that every item can be joined.
data_pos = data_pos.dropna().astype(str)

# Data preparation
def _clean_text(data):
    """Return the leading ASCII-letter run of each word in *data*, space-joined."""
    if isinstance(data, str):
        # A bare string would otherwise be joined character by character.
        data = [data]
    leading_letters = re.compile(r'^[a-zA-Z]*')
    prefixes = (leading_letters.match(word).group()
                for word in ' '.join(data).split())
    # Words that do not start with a letter yield '' and are dropped.
    return ' '.join(prefix for prefix in prefixes if prefix)


def wordcloud_draw(data, color="black"):
    """Draw and show a word cloud built from the texts in *data*.

    Args:
        data: An iterable of strings (or a single string). The items are
            joined with spaces, split on whitespace, and each word is
            reduced to its leading run of ASCII letters.
        color: Passed to ``WordCloud`` as ``background_color``.

    Returns:
        None. The image is displayed with ``plt.show()``.

    Raises:
        ValueError: If no word in *data* starts with an ASCII letter.
    """
    text = _clean_text(data)
    if not text:
        raise ValueError("no alphabetic words found in data")

    # generate() expects plain text; passing the repr of a list of bytes
    # (the old Python 2 idiom) would pollute the cloud with b'...' tokens.
    wordcloud = WordCloud(
        stopwords=STOPWORDS,
        background_color=color,
        width=2500,
        height=2000,
    ).generate(text)

    plt.figure(1, figsize=(13, 13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

# Render the word cloud for the positive-sentiment texts
print("Positive words")
wordcloud_draw(data_pos, color='white')

You can find the files here: data.zip (638.4 KB)