Class: Jobs::Analysis::WordFrequency

Inherits:
Base show all
Defined in:
lib/jobs/analysis/word_frequency.rb

Overview

Produce a parallel word frequency list for a dataset

Class Method Summary (collapse)

Methods inherited from Base

add_concern, download?, job_list, view_paths

Class Method Details

+ (undefined) perform(args = { })

Export the word frequency data.

This saves its data out as a CSV file to be downloaded by the user later. As yet, we don't offer display in the browser; I think this data is so complex that you'll want to pull it up in a spreadsheet.

Note that there are also parameters to be passed in to the ComputeWordFrequencies concern; see that concern's documentation for the specification of those arguments.

Examples:

Start a job for computing a dataset's word frequencies

Resque.enqueue(Jobs::Analysis::WordFrequency,
               user_id: current_user.to_param,
               dataset_id: dataset.to_param,
               task_id: task.to_param,
               [word frequency concern arguments])

Options Hash (args):

  • user_id (String)

    the user whose dataset we are to work on

  • dataset_id (String)

    the dataset to operate on

  • task_id (String)

    the analysis task we're working from

See Also:



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/jobs/analysis/word_frequency.rb', line 34

# Export the word frequency data for a dataset.
#
# Computes word frequencies (via the ComputeWordFrequencies concern) and
# writes them out as a CSV file attached to the analysis task, to be
# downloaded by the user later.
#
# @param [Hash] args the job arguments
# @option args [String] :user_id the user whose dataset we are to work on
# @option args [String] :dataset_id the dataset to operate on
# @option args [String] :task_id the analysis task we're working from
# @return [undefined]
# @raise [ArgumentError] if any of the three IDs does not resolve
def self.perform(args = {})
  args.symbolize_keys!

  # Fetch the user based on ID
  # NOTE(review): ActiveRecord's find typically raises RecordNotFound
  # rather than returning nil, so these three guards may be unreachable
  # in practice -- confirm against the finder implementations in use.
  user = User.find(args[:user_id])
  fail ArgumentError, 'User ID is not valid' unless user

  # Fetch the dataset based on ID
  dataset = user.datasets.find(args[:dataset_id])
  fail ArgumentError, 'Dataset ID is not valid' unless dataset

  # Update the analysis task
  task = dataset.analysis_tasks.find(args[:task_id])
  fail ArgumentError, 'Task ID is not valid' unless task

  task.name = 'Calculate word frequencies'
  task.save

  # Do the analysis; remaining args configure the concern
  analyzer = compute_word_frequencies(dataset, args)

  # Loop-invariant: the dataset size is the same for every word and block
  num_dataset_entries = dataset.entries.count

  # Create some CSV
  csv_string = CSV.generate do |csv|
    csv << ["Word frequency information for dataset #{dataset.name}"]
    csv << ['']

    # Output the per-block data, but only when the analyzer actually
    # split the dataset into more than one block
    if analyzer.blocks.count > 1
      csv << ['Each block of document:']

      name_row = ['']
      header_row = ['']
      word_rows = analyzer.word_list.map { |w| [w] }
      types_row = ['Number of types']
      tokens_row = ['Number of tokens']
      ttr_row = ['Type/token ratio']

      # Each block contributes four columns (frequency, proportion, and
      # two TF/IDF variants) to every word row, plus one stats column
      analyzer.blocks.each_with_index do |b, i|
        s = analyzer.block_stats[i]

        name_row << s[:name] << '' << '' << ''
        header_row << 'Frequency' \
                   << 'Proportion' \
                   << 'TF/IDF (vs. dataset)' \
                   << 'TF/IDF (vs. corpus)'

        word_rows.each do |r|
          word = r[0]

          # Hoist the proportion; it is used by three of the four columns
          proportion = b[word].to_f / s[:tokens].to_f

          r << b[word].to_s
          r << proportion.to_s
          r << Math.tfidf(proportion,
                          analyzer.df_in_dataset[word],
                          num_dataset_entries)
          r << Math.tfidf(proportion,
                          analyzer.df_in_corpus[word],
                          analyzer.num_corpus_documents)
        end

        # Output the block stats at the end
        types_row << s[:types].to_s << '' << '' << ''
        tokens_row << s[:tokens].to_s << '' << '' << ''
        ttr_row << (s[:types].to_f / s[:tokens].to_f).to_s << '' << '' << ''
      end

      csv << name_row
      csv << header_row
      word_rows.each do |r|
        csv << r
      end
      csv << types_row
      csv << tokens_row
      csv << ttr_row
    end

    # Output the whole-dataset data
    csv << ['']
    csv << ['For the entire dataset:']
    csv << ['', 'Frequency', 'Proportion', 'DF (in corpus)', 'TF/IDF (dataset vs. corpus)']
    analyzer.word_list.each do |w|
      tf_in_dataset = analyzer.tf_in_dataset[w]
      csv << [w,
              tf_in_dataset.to_s,
              (tf_in_dataset.to_f / analyzer.num_dataset_tokens.to_f).to_s,
              analyzer.df_in_corpus[w].to_s,
              Math.tfidf(tf_in_dataset, analyzer.df_in_corpus[w], analyzer.num_corpus_documents)]
    end
    csv << ['Number of types', analyzer.num_dataset_types.to_s]
    csv << ['Number of tokens', analyzer.num_dataset_tokens.to_s]
    csv << ['Type/token ratio', (analyzer.num_dataset_types.to_f / analyzer.num_dataset_tokens.to_f).to_s]
    csv << ['']
  end

  # Write it out as the task result
  # NOTE(review): original_filename= / content_type= are not stdlib
  # StringIO methods -- presumably added by a project extension so the
  # attachment library accepts the IO; verify.
  ios = StringIO.new
  ios.write(csv_string)
  ios.original_filename = 'word_frequency.csv'
  ios.content_type = 'text/csv'
  ios.rewind

  task.result = ios
  ios.close

  # We're done here
  task.finish!
end