<!--

@license Apache-2.0

Copyright (c) 2018 The Stdlib Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

-->

# LDA

> [Latent Dirichlet Allocation][lda] via collapsed Gibbs sampling.

<section class="intro">

</section>

<!-- /.intro -->

<section class="usage">

## Usage

```javascript
var lda = require( '@stdlib/nlp/lda' );
```

#### lda( docs, K\[, options] )

[Latent Dirichlet Allocation][lda] via collapsed Gibbs sampling. To create a model, call the `lda` function with an `array` of `strings` and the number of topics `K` to be identified.

```javascript
var model;
var docs;

docs = [
    'I loved you first',
    'For one is both and both are one in love',
    'You never see my pain',
    'My love is such that rivers cannot quench',
    'See a lot of pain, a lot of tears'
];

model = lda( docs, 2 );
// returns {}
```

After initialization, model parameters are estimated by calling the `.fit()` method, which performs collapsed Gibbs sampling.

The model object contains the following methods:

#### model.fit( iter, burnin, thin )

<!-- run-disable -->

```javascript
model.fit( 1000, 100, 10 );
```

The `iter` parameter denotes the number of sampling iterations. One thousand iterations is a common choice, but it is not always sufficient; empirical diagnostics can be used to assess whether the constructed Markov chain has converged. `burnin` denotes the number of initial samples discarded before estimation begins, whereas `thin` controls the sampling interval thereafter: only every `thin`-th sample is retained.

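Under these settings, the number of samples retained for estimation follows directly from `iter`, `burnin`, and `thin`. The following is a minimal sketch of that arithmetic; the `retainedSamples` helper is illustrative and **not** part of the package, and assumes the first `burnin` samples are discarded and every `thin`-th sample is kept thereafter.

```javascript
// Illustrative helper (not part of `@stdlib/nlp/lda`): compute the number of
// Gibbs samples retained for estimation, assuming the first `burnin` samples
// are discarded and every `thin`-th sample is kept thereafter:
function retainedSamples( iter, burnin, thin ) {
    return Math.floor( ( iter - burnin ) / thin );
}

var n = retainedSamples( 1000, 100, 10 );
// returns 90
```
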
#### model.getTerms( k\[, no = 10] )

Returns the `no` terms with the highest probabilities for the chosen topic `k`.

<!-- run-disable -->

```javascript
var words = model.getTerms( 0, 3 );
/* returns
    [
        { 'word': 'both', 'prob': 0.06315008476532499 },
        { 'word': 'pain', 'prob': 0.05515729517235543 },
        { 'word': 'one', 'prob': 0.05486669737616135 }
    ]
*/
```

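Beyond `getTerms`, the fitted model also stores averaged document-topic proportions on `model.avgTheta`, a matrix-like object whose `( d, k )` entry estimates the share of topic `k` in document `d`; this is how the Examples section below reads off per-year topic proportions. A minimal sketch, assuming the model from the snippets above has already been fitted:

<!-- run-disable -->

```javascript
// Estimated proportion of topic `1` in document `0` (averaged over the
// retained Gibbs samples):
var p = model.avgTheta.get( 0, 1 );
// returns <number>
```
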
</section>

<!-- /.usage -->

<section class="examples">

## Examples

<!-- eslint no-undef: "error" -->

```javascript
var sotu = require( '@stdlib/datasets/sotu' );
var roundn = require( '@stdlib/math/base/special/roundn' );
var stopwords = require( '@stdlib/datasets/stopwords-en' );
var lowercase = require( '@stdlib/string/lowercase' );
var lda = require( '@stdlib/nlp/lda' );

var speeches;
var words;
var terms;
var model;
var str;
var i;
var j;

// Create regular expressions for matching (and removing) stopwords:
words = stopwords();
for ( i = 0; i < words.length; i++ ) {
    words[ i ] = new RegExp( '\\b' + words[ i ] + '\\b', 'gi' );
}

// Load the State of the Union addresses from 1930 to 2010, convert each speech to lowercase, and remove stopwords:
speeches = sotu({
    'range': [ 1930, 2010 ]
});
for ( i = 0; i < speeches.length; i++ ) {
    str = lowercase( speeches[ i ].text );
    for ( j = 0; j < words.length; j++ ) {
        str = str.replace( words[ j ], '' );
    }
    speeches[ i ] = str;
}

// Create a three-topic model and run the collapsed Gibbs sampler:
model = lda( speeches, 3 );

model.fit( 1000, 100, 10 );

// Print the average topic proportions for each year:
for ( i = 0; i <= 80; i++ ) {
    str = 'Year: ' + (1930+i) + '\t';
    str += 'Topic 1: ' + roundn( model.avgTheta.get( i, 0 ), -3 ) + '\t';
    str += 'Topic 2: ' + roundn( model.avgTheta.get( i, 1 ), -3 ) + '\t';
    str += 'Topic 3: ' + roundn( model.avgTheta.get( i, 2 ), -3 );
    console.log( str );
}

// Print the twenty words most associated with each topic:
terms = model.getTerms( 0, 20 );
for ( i = 0; i < terms.length; i++ ) {
    terms[ i ] = terms[ i ].word;
}
console.log( 'Words most associated with first topic:\n ' + terms.join( ', ' ) );

terms = model.getTerms( 1, 20 );
for ( i = 0; i < terms.length; i++ ) {
    terms[ i ] = terms[ i ].word;
}
console.log( 'Words most associated with second topic:\n ' + terms.join( ', ' ) );

terms = model.getTerms( 2, 20 );
for ( i = 0; i < terms.length; i++ ) {
    terms[ i ] = terms[ i ].word;
}
console.log( 'Words most associated with third topic:\n ' + terms.join( ', ' ) );
```

</section>

<!-- /.examples -->

<section class="links">

[lda]: https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

</section>

<!-- /.links -->