import json
import pickle
import time
from collections import Counter, defaultdict
from typing import Dict, List, Optional


def create_anagram_map(dictionary_path: str = 'words.txt') -> Optional[Dict[str, List[str]]]:
    """Read a dictionary file and group its words into anagram sets.

    words.txt from
    https://apiacoa.org/publications/teaching/datasets/google-10000-english.txt

    Each word is stripped and lower-cased. Lines starting with ``#!`` and
    words that are not purely alphabetic are skipped. The letters of a word,
    sorted, form the canonical key shared by all of its anagrams.

    Args:
        dictionary_path: The path to the dictionary file (one word per line).

    Returns:
        A dictionary mapping each sorted-character key to the list of distinct
        words (in first-seen order) that share it. Only keys with more than
        one word are kept. Returns None if the file does not exist.
    """
    groups = defaultdict(list)
    # Words already recorded. Without this, a word listed twice in the file
    # (or in two different casings) would form a bogus "anagram group" with
    # itself and survive the len(words) > 1 filter below.
    seen_words = set()
    print(f"Starting preprocessing of '{dictionary_path}'...")

    try:
        with open(dictionary_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Clean the word: remove whitespace and convert to lowercase.
                cleaned_word = line.strip().lower()
                if cleaned_word.startswith("#!"):
                    continue
                # Only purely alphabetic words are processed, each one once.
                if not cleaned_word.isalpha() or cleaned_word in seen_words:
                    continue
                seen_words.add(cleaned_word)
                # The sorted string is the canonical key.
                groups["".join(sorted(cleaned_word))].append(cleaned_word)
    except FileNotFoundError:
        print(f"Error: Dictionary file not found at '{dictionary_path}'.")
        print("Please download a dictionary file (e.g., from GitHub) and save it as 'words.txt'.")
        return None

    # Drop keys that have a single word: they have no anagrams, and removing
    # them keeps the saved map small.
    final_map = {key: words for key, words in groups.items() if len(words) > 1}

    print(f"Preprocessing complete. Found {len(final_map)} unique anagram groups.")
    return final_map


def save_data_json(data, output_path: str = 'anagram_data.json') -> None:
    """Save the given data structure to a JSON file.

    Args:
        data: A JSON-serializable object. If None, nothing is written and a
            notice is printed instead.
        output_path: Destination file path; the file is overwritten.
    """
    if data is None:
        print("No data to save.")
        return

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f)
    print(f"Anagram map successfully saved to '{output_path}'.")


def analyze_anagram_frequencies(anagram_map) -> None:
    """Print how many anagram groups exist for each group size.

    Args:
        anagram_map (dict): The map of anagrams to analyze (key -> list of
            words). An empty or None map prints a notice and returns.
    """
    if not anagram_map:
        print("Anagram map is empty. No frequencies to analyze.")
        return

    # Tally group sizes: {size of group: number of groups with that size}.
    frequency = Counter(len(words) for words in anagram_map.values())

    print("\n--- Anagram Group Size Frequencies ---")
    for count, freq in sorted(frequency.items()):
        print(f"Groups with {count} anagrams: {freq}")
    print("--------------------------------------\n")


def main() -> None:
    """Build the anagram map from 'words.txt', report on it and save it."""
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # Create the map from the dictionary file.
    processed_data = create_anagram_map('words.txt')

    # Save the processed data for the main app to use.
    if processed_data:
        # Analyze and display frequencies before saving.
        analyze_anagram_frequencies(processed_data)
        save_data_json(processed_data)

    end_time = time.perf_counter()
    print(f"Total preprocessing time: {end_time - start_time:.2f} seconds.")


if __name__ == '__main__':
    main()