You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
635 lines
21 KiB
C++
635 lines
21 KiB
C++
/*=========================================================================
|
|
|
|
Program: Visualization Toolkit
|
|
Module: vtkWordCloud.h
|
|
|
|
Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
|
|
All rights reserved.
|
|
See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
|
|
|
|
This software is distributed WITHOUT ANY WARRANTY; without even
|
|
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
PURPOSE. See the above copyright notice for more information.
|
|
|
|
=========================================================================*/
|
|
#ifndef vtkWordCloud_h
|
|
#define vtkWordCloud_h
|
|
|
|
#include "vtkImageAlgorithm.h"
#include "vtkImageData.h"         // For ImageData
#include "vtkInfovisCoreModule.h" // For export macro
#include "vtkSmartPointer.h"      // For SmartPointer

#include <array>      // For stl array
#include <functional> // for function
#include <set>        // for stl multiset
#include <string>     // For stl string
#include <tuple>      // For stl tuple (PairType)
#include <utility>    // For stl pair and std::move
#include <vector>     // For stl vector
|
|
|
|
/**
|
|
* @class vtkWordCloud
|
|
* @brief generate a word cloud visualization of a text document
|
|
*
|
|
* Word Clouds, AKA Tag Clouds
|
|
* (https://en.wikipedia.org/wiki/Tag_cloud), are a text visualization
|
|
* technique that displays individual words with properties that
|
|
* depend on the frequency of a word in a document. vtkWordCloud
|
|
* varies the font size based on word frequency. Word Clouds are useful
|
|
* for quickly perceiving the most prominent terms in a document.
|
|
* Also, Word Clouds can identify trends and patterns that would
|
|
* otherwise be unclear or difficult to see in a tabular
|
|
* format. Frequently used keywords stand out better in a word
|
|
* cloud. Common words that might be overlooked in tabular form are
|
|
* highlighted in the larger text, making them pop out when displayed
|
|
* in a word cloud.
|
|
*
|
|
* There is some controversy about the usefulness of word
|
|
* clouds. Their best use may be for presentations, see
|
|
* https://tinyurl.com/y59hy7oa
|
|
*
|
|
* The generation of the word cloud proceeds as follows:
|
|
* 1. Read the text file
|
|
* 2. Split text into words to be processed
|
|
* Extract words from the text
|
|
* Drop the case of each word for filtering
|
|
* Filter the words
|
|
* Replace words from the ReplacementPairs list
|
|
* Skip the word if it is in the stop list or contains a digit
|
|
* Skip single character words
|
|
* Raise the case of the first letter in each word
|
|
* Sort the word list by frequency
|
|
* 3. Create a rectangular mask image or read a mask image
|
|
* 4. For each word
|
|
* Render the word into an image
|
|
* Try to add the word to the word cloud image.
|
|
* For each orientation, see if the word "fits"
|
|
* If no fit, move along a path to try another location
|
|
*
|
|
* NOTE: A word fits if all of the non-zero word cloud pixels in the
|
|
* extent of the text image are background pixels.
|
|
*
|
|
* NOTE: The path is an Archimedean Spiral
|
|
* (https://en.wikipedia.org/wiki/Archimedean_spiral)
|
|
|
|
* NOTE: vtkWordCloud has a built-in list of stop words. Stop words are
|
|
* words that are filtered out before processing of the text, such as
|
|
* the, is, at, which, and so on.
|
|
*
|
|
* NOTE: Color names are defined in vtkNamedColors. A visual
|
|
* representation of color names is here: https://tinyurl.com/y3yxcxj6
|
|
*
|
|
* NOTE: vtkWordCloud offers several methods to customize the resulting
|
|
* visualization. The class provides defaults that provide a reasonable
|
|
* result.
|
|
*
|
|
* BackgroundColorName - The vtkNamedColors name for the background
|
|
* (MidNightBlue). See https://tinyurl.com/y3yxcxj6 for a visual
|
|
* representation of the named colors.
|
|
*
|
|
* ColorDistribution - Distribution of random colors(.6 1.0), if
|
|
* WordColorName is empty.
|
|
*
|
|
* ColorSchemeName - Name of a color scheme from vtkColorSeries to be
|
|
* used to select colors for the words (), if WordColorName is empty.
|
|
* See https://tinyurl.com/y3j6c27o for a visual representation of the
|
|
* color schemes.
|
|
*
|
|
* DPI - Dots per inch(200) of the rendered text. DPI is used as a
|
|
* scaling mechanism for the words. As DPI increases, the word size
|
|
* increases. If there are too few skipped words, increase this value,
|
|
* too many, decrease it.
|
|
*
|
|
* FontFileName - If empty, the built-in Arial font is used(). The
|
|
* FontFileName is the name of a file that contains a TrueType font.
|
|
* https://www.1001freefonts.com/ is a good source for free TrueType
|
|
* fonts.
|
|
*
|
|
* FontMultiplier - Font multiplier(6). The final FontSize is this value
|
|
* times the word frequency.
|
|
*
|
|
* Gap - Space gap of words (2). The gap is the number of spaces added to
|
|
* the beginning and end of each word.
|
|
*
|
|
* MaskColorName - Name of the color for the mask (black). This is the
|
|
* name of the vtkNamedColors that defines the foreground of the
|
|
* mask. Usually black or white. See https://tinyurl.com/y3yxcxj6 for
|
|
* a visual representation of the named colors.
|
|
*
|
|
* MaskFileName - Mask file name(). If a mask file is specified, it will be
|
|
* used as the mask. Otherwise, a black square is used as the mask. The
|
|
* mask file should contain three channels of unsigned char values. If
|
|
* the mask file is just a single unsigned char, turn the boolean
* BWMask on. If BWMask is on, the class will create a three channel
|
|
* image using vtkImageAppendComponents.
|
|
*
|
|
* BWMask - Mask image has a single channel(false). Mask images typically
|
|
* have three channels (r,g,b).
|
|
*
|
|
* MaxFontSize - Maximum font size(48).
|
|
*
|
|
* MinFontSize - Minimum font size(8).
|
|
*
|
|
* MinFrequency - Minimum word frequency accepted(2). Word with
|
|
* frequencies less than this will be ignored.
|
|
*
|
|
* OffsetDistribution - Range of uniform random offsets(-size[0]/100.0
|
|
* -size[1]/100.0)(-20 20). These offsets are offsets from the generated
|
|
* path for word layout.
|
|
*
|
|
* OrientationDistribution - Ranges of random orientations(-20 20). If
|
|
* discrete orientations are not defined, these orientations will be
|
|
* generated.
|
|
*
|
|
* Orientations - Vector of discrete orientations(). If non-empty,
|
|
* these will be used instead of the orientations distribution.
|
|
*
|
|
* ReplacementPairs - Replace the first word with another second word
|
|
* (). The first word will also be added to the StopList. The second
|
|
* argument can contain multiple words. For example you could replace
|
|
* "bill" with "Bill Lorensen" or "vtk" with "VTK". Remember that
|
|
* words are always stored internally with lower case, even though the
|
|
* first letter is capitalized in the Word Cloud.
|
|
*
|
|
* Sizes - Size of image(640 480).
|
|
*
|
|
* StopWords - User provided stop words(). Stop words are words that
|
|
* are filtered out before processing of the text, such as the, is,
|
|
* at, which, and so on. vtkWordCloud has built-in stop words. The
|
|
* user-provided stop words are added to the built-in list. See
|
|
* https://en.wikipedia.org/wiki/Stop_words for a description. The
|
|
* built-in stop words were derived from the english stop words at
|
|
* https://www.ranks.nl/stopwords. Stop words for other languages are
|
|
* also available.
|
|
*
|
|
* StopListFileName - the name of a file that contains stop words,
|
|
* one word per line (). If present, the stop words in the file
|
|
* replace the built-in stop list.
|
|
*
|
|
* Title - Add this word to the document's words and set a high
|
|
* frequency, so that it will be rendered first.
|
|
*
|
|
* WordColorName - Name of the color for the words(). The name is
|
|
* selected from vtkNamedColors. If the name is empty, the
|
|
* ColorDistribution will generate random colors. See
|
|
* https://tinyurl.com/y3yxcxj6 for a visual representation of the
|
|
* named colors.
|
|
*
|
|
* The class also provides Get methods that return the vectors
* StoppedWords, SkippedWords and KeptWords.
|
|
*/
|
|
|
|
class VTKINFOVISCORE_EXPORT vtkWordCloud : public vtkImageAlgorithm
{
public:
  vtkTypeMacro(vtkWordCloud, vtkImageAlgorithm);
  void PrintSelf(ostream& os, vtkIndent indent) override;

  /**
   * Create a new vtkWordCloud. The documented default of each
   * parameter is given in parentheses in the description of its
   * Set/Get methods.
   */
  static vtkWordCloud* New();

  // Container types used by the Set/Get methods below.
  using ColorDistributionContainer = std::array<double, 2>;
  using OffsetDistributionContainer = std::array<int, 2>;
  using OrientationDistributionContainer = std::array<double, 2>;
  using OrientationsContainer = std::vector<double>;
  using PairType = std::tuple<std::string, std::string>;
  using ReplacementPairsContainer = std::vector<PairType>;
  using SizesContainer = std::array<int, 2>;
  using StopWordsContainer = std::set<std::string>;
  using StringContainer = std::vector<std::string>;

  //@{
  /**
   * Return the AdjustedSizes of the resized mask file.
   */
  virtual SizesContainer GetAdjustedSizes() { return this->AdjustedSizes; }
  //@}

// Define a virtual Set##name(container) method for the member `name`,
// which must be held in a standard container type. The member is
// replaced, and Modified() is called, only when the new contents differ
// from the current contents. The comparison uses the container's own
// operator!=, which compares sizes and then elements with ==.
// NOTE(review): the macro is not #undef'd in this header, so it stays
// visible to every file that includes it.
#define SetStdContainerMacro(name, container)                                                      \
  virtual void Set##name(container arg)                                                            \
  {                                                                                                \
    if (this->name != arg)                                                                         \
    {                                                                                              \
      this->name = std::move(arg);                                                                 \
      this->Modified();                                                                            \
    }                                                                                              \
  }

  //@{
  /**
   * Set/Get the vtkNamedColors name for the background(MidNightBlue).
   * The object is marked modified only when the name changes.
   */
  virtual void SetBackgroundColorName(std::string arg)
  {
    if (this->BackgroundColorName != arg)
    {
      // Store first, then notify, so observers see the new value.
      this->BackgroundColorName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetBackgroundColorName() { return this->BackgroundColorName; }
  //@}

  //@{
  /**
   * Set/Get boolean that indicates the mask image is a single
   * channel(false).
   */
  virtual void SetBWMask(bool arg)
  {
    if (this->BWMask != arg)
    {
      this->BWMask = arg;
      this->Modified();
    }
  }
  virtual bool GetBWMask() { return this->BWMask; }
  //@}

  //@{
  /**
   * Set/Get ColorSchemeName, the name of a color scheme from
   * vtkColorScheme to be used to select colors for the words (), if
   * WordColorName is empty. See https://tinyurl.com/y3j6c27o for a
   * visual representation of the color schemes.
   */
  virtual void SetColorSchemeName(std::string arg)
  {
    if (this->ColorSchemeName != arg)
    {
      this->ColorSchemeName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetColorSchemeName() { return this->ColorSchemeName; }
  //@}

  //@{
  /**
   * Set/Get DPI - Dots per inch(200) of the rendered text. DPI is
   * used as a scaling mechanism for the words. As DPI increases,
   * the word size increases. If there are too few skipped words,
   * increase this value, too many, decrease it.
   */
  vtkSetMacro(DPI, int);
  vtkGetMacro(DPI, int);
  //@}

  //@{
  /**
   * Set/Get FileName, the name of the file that contains the text to
   * be processed.
   */
  virtual void SetFileName(std::string arg)
  {
    if (this->FileName != arg)
    {
      this->FileName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetFileName() { return this->FileName; }
  //@}

  //@{
  /**
   * Set/Get FontFileName, If empty, the built-in Arial font is
   * used(). The FontFileName is the name of a file that contains a
   * TrueType font.
   */
  virtual void SetFontFileName(std::string arg)
  {
    if (this->FontFileName != arg)
    {
      this->FontFileName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetFontFileName() { return this->FontFileName; }
  //@}

  //@{
  /**
   * Set/Get Gap, the space gap of words (2). The gap is the number
   * of spaces added to the beginning and end of each word.
   */
  vtkSetMacro(Gap, int);
  vtkGetMacro(Gap, int);
  //@}

  //@{
  /**
   * Set/Get MaskColorName, the name of the color for the mask
   * (black). This is the name of the vtkNamedColors that defines
   * the foreground of the mask. Usually black or white.
   */
  virtual void SetMaskColorName(std::string arg)
  {
    if (this->MaskColorName != arg)
    {
      this->MaskColorName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetMaskColorName() { return this->MaskColorName; }
  //@}

  //@{
  /**
   * Set/Get MaskFileName, the mask file name(). If a mask file is
   * specified, it will be used as the mask. Otherwise, a black
   * square is used as the mask. The mask file should contain three
   * channels of unsigned char values. If the mask file is just a
   * single unsigned char, turn the boolean BWMask on. If BWMask is
   * on, the class will create a three channel image using
   * vtkImageAppendComponents.
   */
  virtual void SetMaskFileName(std::string arg)
  {
    if (this->MaskFileName != arg)
    {
      this->MaskFileName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetMaskFileName() { return this->MaskFileName; }
  //@}

  //@{
  /**
   * Set/Get MaxFontSize, the maximum font size(48).
   */
  vtkSetMacro(MaxFontSize, int);
  vtkGetMacro(MaxFontSize, int);
  //@}

  //@{
  /**
   * Set/Get MinFontSize, the minimum font size(8).
   */
  vtkSetMacro(MinFontSize, int);
  vtkGetMacro(MinFontSize, int);
  //@}

  //@{
  /**
   * Set/Get MinFrequency, the minimum word frequency
   * accepted(2). Words with frequencies less than this will be
   * ignored.
   */
  vtkSetMacro(MinFrequency, int);
  vtkGetMacro(MinFrequency, int);
  //@}

  //@{
  /**
   * Set/Get FontMultiplier, the font multiplier(6). The final
   * FontSize is this value times the word frequency.
   */
  vtkSetMacro(FontMultiplier, int);
  vtkGetMacro(FontMultiplier, int);
  //@}

  //@{
  /**
   * Set/Get ColorDistribution, the distribution of random colors(.6
   * 1.0), if WordColorName is empty.
   */
  SetStdContainerMacro(ColorDistribution, ColorDistributionContainer);
  virtual ColorDistributionContainer GetColorDistribution() { return this->ColorDistribution; }
  //@}

  //@{
  /**
   * Set/Get OffsetDistribution, the range of uniform random
   * offsets(-size[0]/100.0 -size[1]/100.0)(-20 20). These offsets
   * are offsets from the generated path for word layout.
   */
  SetStdContainerMacro(OffsetDistribution, OffsetDistributionContainer);
  virtual OffsetDistributionContainer GetOffsetDistribution() { return this->OffsetDistribution; }
  //@}

  //@{
  /**
   * Set/Get OrientationDistribution, ranges of random
   * orientations(-20 20). If discrete orientations are not defined,
   * these orientations will be generated.
   */
  SetStdContainerMacro(OrientationDistribution, OrientationDistributionContainer);
  virtual OrientationDistributionContainer GetOrientationDistribution()
  {
    return this->OrientationDistribution;
  }
  //@}

  //@{
  /**
   * Set/Add/Get Orientations, a vector of discrete orientations (). If
   * non-empty, these will be used instead of the orientations
   * distribution. AddOrientation appends one value and always marks
   * the object modified.
   */
  SetStdContainerMacro(Orientations, OrientationsContainer);
  void AddOrientation(double arg)
  {
    this->Orientations.push_back(arg);
    this->Modified();
  }
  virtual OrientationsContainer GetOrientations() { return this->Orientations; }
  //@}

  //@{
  /**
   * Set/Add/Get ReplacementPairs, a vector of words that replace the
   * first word with another second word (). The first word is also
   * added to the StopList. AddReplacementPair appends one pair and
   * always marks the object modified.
   */
  SetStdContainerMacro(ReplacementPairs, ReplacementPairsContainer);
  void AddReplacementPair(PairType arg)
  {
    this->ReplacementPairs.push_back(std::move(arg));
    this->Modified();
  }
  virtual ReplacementPairsContainer GetReplacementPairs() { return this->ReplacementPairs; }
  //@}

  //@{
  /**
   * Set/Get Sizes, the size of the output image(640 480).
   */
  SetStdContainerMacro(Sizes, SizesContainer);
  virtual SizesContainer GetSizes() { return this->Sizes; }
  //@}

  //@{
  /**
   * Set/Add/Clear/Get StopWords, a set of user provided stop
   * words(). vtkWordCloud has built-in stop words. The user-provided
   * stop words are added to the built-in list.
   * AddStopWord and ClearStopWords mark the object modified only when
   * the set actually changes, matching the behavior of the setters.
   */
  SetStdContainerMacro(StopWords, StopWordsContainer);
  void AddStopWord(std::string word)
  {
    // insert() reports through .second whether the word was newly added.
    if (this->StopWords.insert(std::move(word)).second)
    {
      this->Modified();
    }
  }
  void ClearStopWords()
  {
    if (!this->StopWords.empty())
    {
      this->StopWords.clear();
      this->Modified();
    }
  }
  virtual StopWordsContainer GetStopWords() { return this->StopWords; }
  //@}

  //@{
  /**
   * Set/Get StopListFileName, the name of the file that contains the
   * stop words, one per line.
   */
  virtual void SetStopListFileName(std::string arg)
  {
    if (this->StopListFileName != arg)
    {
      this->StopListFileName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetStopListFileName() { return this->StopListFileName; }
  //@}

  //@{
  /**
   * Set/Get Title, add this word to the document's words and set a
   * high frequency, so that it will be rendered first.
   */
  virtual void SetTitle(std::string arg)
  {
    if (this->Title != arg)
    {
      this->Title = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetTitle() { return this->Title; }
  //@}

  //@{
  /**
   * Set/Get WordColorName, the name of the color for the
   * words(). The name is selected from vtkNamedColors. If the name
   * is empty, the ColorDistribution will generate random colors.
   */
  virtual void SetWordColorName(std::string arg)
  {
    if (this->WordColorName != arg)
    {
      this->WordColorName = std::move(arg);
      this->Modified();
    }
  }
  virtual std::string GetWordColorName() { return this->WordColorName; }
  //@}

  //@{
  /**
   * Get a vector of words that are kept in the final image.
   * The returned reference refers to the internal vector.
   */
  virtual std::vector<std::string>& GetKeptWords() { return this->KeptWords; }
  //@}

  //@{
  /**
   * Get a vector of words that are skipped. Skipped words do not fit
   * in the final image.
   * The returned reference refers to the internal vector.
   */
  virtual std::vector<std::string>& GetSkippedWords() { return this->SkippedWords; }
  //@}

  //@{
  /**
   * Get a vector of words that were stopped in the final image.
   * The returned reference refers to the internal vector.
   */
  virtual std::vector<std::string>& GetStoppedWords() { return this->StoppedWords; }
  //@}

protected:
  vtkWordCloud();
  ~vtkWordCloud() override = default;

  // vtkImageAlgorithm pipeline overrides; both are defined out of line.
  int RequestInformation(vtkInformation*, vtkInformationVector**, vtkInformationVector*) override;

  int RequestData(vtkInformation*, vtkInformationVector**, vtkInformationVector*) override;

  vtkSmartPointer<vtkImageData> ImageData;
  int WholeExtent[6];

  // Parameters exposed through the Set/Get methods above.
  SizesContainer AdjustedSizes;
  std::string BackgroundColorName;
  bool BWMask;
  ColorDistributionContainer ColorDistribution;
  std::string ColorSchemeName;
  int DPI;
  std::string FileName;
  std::string FontFileName;
  int FontMultiplier;
  int Gap;
  std::string MaskColorName;
  std::string MaskFileName;
  int MaxFontSize;
  int MinFontSize;
  int MinFrequency;
  OffsetDistributionContainer OffsetDistribution;
  OrientationDistributionContainer OrientationDistribution;
  OrientationsContainer Orientations;
  ReplacementPairsContainer ReplacementPairs;
  SizesContainer Sizes;
  StopWordsContainer StopWords;
  std::string StopListFileName;
  std::string Title;
  std::string WordColorName;

  // Word lists exposed through GetKeptWords/GetSkippedWords/GetStoppedWords.
  std::vector<std::string> KeptWords;
  std::vector<std::string> SkippedWords;
  std::vector<std::string> StoppedWords;

private:
  vtkWordCloud(const vtkWordCloud&) = delete;
  void operator=(const vtkWordCloud&) = delete;

  // Predicate type that accepts two (string, int) pairs and returns a bool.
  using Comparator = std::function<bool(std::pair<std::string, int>, std::pair<std::string, int>)>;

  // Returns (string, int) pairs in a multiset ordered by a Comparator.
  // NOTE(review): presumably the pairs are (word, frequency); the
  // definition is not in this header -- confirm against the .cxx.
  std::multiset<std::pair<std::string, int>, Comparator> FindWordsSortedByFrequency(
    std::string&, vtkWordCloud*);

  // Integer (x, y) offset pair; both components default to 0.
  struct ExtentOffset
  {
    ExtentOffset(int _x = 0, int _y = 0)
      : x(_x)
      , y(_y)
    {
    }
    int x;
    int y;
  };
};
|
|
#endif
|
|
|
|
// LocalWords: vtkNamedColors SetMaskColorName
|