1616LETTERS_PATH = MODEL_DIR / "norvig_letter_ngrams.csv"
1717WORD_LENGTHS_PATH = MODEL_DIR / "norvig_word_length_frequencies.csv"
1818
19+
20+
1921# Shannon's English language generator using letter frequency
2022
2123# Relative frequencies of letters in general English plain text, from "Cryptological Mathematics" by Robert Edward Lewand
@@ -54,30 +56,33 @@ def sample_ngram(lookups, n, prefix="", k=1):
5456 return random .choices (data ["keys" ], weights = data ["freqs" ], k = k )
5557
5658
def generate_word(N, n) -> str:  # N = max letters, n = context window (as in, n-gram)
    """Generate one pseudo-word, letter by letter, from the n-gram lookups.

    The first letter is drawn from the 1-gram table. Each further step
    samples an n-gram of order ``min(n, i)`` whose prefix is the last
    ``order - 1`` letters generated so far, and replaces that tail of the
    word with the sampled n-gram.
    (Assumes every key returned by ``sample_ngram`` starts with the prefix
    it was asked for -- TODO confirm against ``sample_ngram``.)

    Args:
        N: Maximum number of letters in the word.
        n: Context window (n-gram order). Values below 1 are treated as 1.

    Returns:
        The generated word. If generation stops early the word is
        truncated and marked: a trailing ``#`` when no n-gram table of the
        required order exists, a trailing ``$`` when the current prefix has
        no bucket. If the very first letter already has no bucket, the
        bare 1-gram is returned without a marker.
    """
    lookups = NGRAM_LOOKUPS

    # With n == 1 the slice bounds ``-n + 1`` evaluate to 0, which selects the
    # opposite end of the string from what is intended; the context length is
    # therefore handled explicitly below. A window < 1 is meaningless, so
    # clamp it rather than let negative/zero slices scramble the word.
    n = max(1, n)

    word = sample_ngram(lookups, n=1, prefix="", k=1)[0]
    if printing == 1:
        print("1-gram:", word)

    for i in range(2, N + 1):
        order = min(n, i)  # n-gram order usable at this position

        if len(lookups) <= order:  # ## no n-grams of this order available -> stop
            return word + '#'

        ctx = order - 1  # number of trailing letters used as prefix
        prefix = word[-ctx:] if ctx else ""

        if prefix not in lookups[order]:  # $$ missing bucket -> stop
            if i > 2:
                return word + "$"
            # Nothing follows the first letter: return it unmarked.
            return word

        new = sample_ngram(lookups, n=order, prefix=prefix, k=1)[0]
        if printing == 1:
            print(f"i = {i} , N = {N} , n = {n} ,new string = {new} ")

        # Drop the letters that served as prefix; ``new`` carries them again.
        head = word[:-ctx] if ctx else word
        word = head + new
        if printing == 1:
            print(f"{i} -gram:", word)

    return word
8186
8287def csv_to_lists (filename : str ) -> list :
8388 frequencies = []
@@ -95,11 +100,14 @@ def run(response, answer, params:Params) -> Result:
95100 word_lengths ["tokens" ] = [row [0 ] for row in data ]
96101 word_lengths ["weights" ] = [row [1 ] for row in data ]
97102 word_count = params .get ("word_count" , 10 )
103+ response_used = isinstance (response , int )
104+ context_window = response if response_used else params .get ("context_window" , 3 )
98105 if word_count == "random" :
99106 word_count = random .randint (3 ,15 )
100107 for i in range (word_count ):
101108 k = int (random .choices (word_lengths ["tokens" ],weights = word_lengths ["weights" ],k = 1 )[0 ])
102- output .append (generate_word (k ))
103- output = ' ' .join (output )
109+ output .append (generate_word (k ,context_window ))
110+ feedback_items = [("general" , ' ' .join (output ))]
111+ feedback_items .append ("| Answer not an integer; used default context window" ) if not response_used else None
104112 is_correct = True
105- return Result (is_correct = is_correct ,feedback_items = [( "general" , output )] )
113+ return Result (is_correct = is_correct ,feedback_items = feedback_items )
0 commit comments