Created
February 4, 2026 05:50
-
-
Save boyter/47d8dcc968a03edaecda196097c3822b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
# Mapping table to restrict scc to specific language extensions.
#
# Keys are category names (the stem of a markdown file, e.g. "Go" for "Go.md").
# Values are comma-separated lists that are passed verbatim to scc's "-i"
# (include-ext) option. Categories missing from this table are analysed
# without any extension filter.
#
# NOTE: "MATLAB" and "Objective-C" both map to "m", so the extension filter
# alone cannot tell those two languages apart.
LANGUAGE_EXTENSIONS = {
    "ActionScript": "as",
    "Clojure": "clj,cljc",
    "C": "c,ec,pgc",
    "CoffeeScript": "coffee",
    "CPP": "cc,cpp,cxx,c++,pcc,ino,ccm,cppm,cxxm,c++m,mxx",
    "CSharp": "cs,csx",
    "CSS": "css",
    "Dart": "dart",
    "DM": "dm",
    "Elixir": "ex,exs",
    "Go": "go",
    "Groovy": "groovy,grt,gtpl,gvy",
    "Haskell": "hs",
    "HTML": "html,htm",
    "Java": "java",
    "JavaScript": "js,cjs,mjs",
    "Julia": "jl",
    "Kotlin": "kt,kts",
    "Lua": "lua",
    "MATLAB": "m",
    "Objective-C": "m",
    "Perl": "pl,plx,pm",
    "PHP": "php",
    "PowerShell": "ps1,psm1",
    "Python": "py,pyw,pyi",
    "R": "r",
    "Ruby": "rb",
    "Rust": "rs",
    "Scala": "sc,scala",
    "Shell": "sh,.tcshrc",
    "Swift": "swift",
    "TeX": "tex,sty",
    "TypeScript": "ts,tsx",
    # "vimrc" and "gvimrc" used to be listed twice; each entry now appears once.
    "Vim-script": "vim,vimrc,gvimrc,_vimrc,.vimrc,_gvimrc,.gvimrc",
}
def extract_links(filepath):
    """Extract unique GitHub repository clone URLs from a markdown file.

    Every ``https://github.com/<owner>/<repo>`` occurrence is collected and
    normalised so that it ends in ``.git``. Duplicates are dropped and the
    order of first appearance is preserved.

    Args:
        filepath: Path to the markdown file, as a ``str`` or ``pathlib.Path``.

    Returns:
        A list of clone URLs. Reading is best-effort: if the file cannot be
        opened or decoded, a message is printed and the links gathered so far
        (possibly none) are returned.
    """
    github_regex = re.compile(r'https://github\.com/[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+')
    # Normalise once so the error message below works for plain strings too
    # (a str has no ``.name`` attribute).
    path = Path(filepath)
    urls = []
    seen = set()  # O(1) membership test; ``urls`` keeps first-seen order
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                for m in github_regex.findall(line):
                    link = m if m.endswith('.git') else f"{m}.git"
                    if link not in seen:
                        seen.add(link)
                        urls.append(link)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path such as an embedded NUL.
        print(f" [!] Error reading {path.name}: {e}")
    return urls
def _repo_name_from_url(url):
    """Return the last path segment of *url* without a trailing '.git'."""
    name = url.split('/')[-1]
    # Strip only the suffix: str.replace('.git', '') would also mangle names
    # that merely contain ".git", e.g. "user.github.io" -> "userhub.io".
    return name[:-4] if name.endswith('.git') else name


def _clone_repo(url, dest):
    """Shallow-clone *url* into *dest*; return True on success, False otherwise."""
    try:
        subprocess.run(
            ["git", "clone", "--depth", "1", url, str(dest)],
            check=True, capture_output=True
        )
    except subprocess.CalledProcessError:
        return False
    return True


def _scc_to_sqlite(repo_name, work_dir, db_path, scc_format, include_args):
    """Run scc on *repo_name* inside *work_dir* and load its SQL into *db_path*.

    Returns True only when both scc and sqlite3 exit successfully.
    """
    scc_cmd = ["scc", "-u", "-a", "--format", scc_format, *include_args, repo_name]
    try:
        # Run from work_dir with the bare folder name as the target so that
        # scc uses repo_name as the project column.
        scc = subprocess.run(
            scc_cmd, cwd=str(work_dir), stdout=subprocess.PIPE, check=True
        )
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters intact, and checking scc separately means a failed
        # scc run can no longer hide behind sqlite3's exit status.
        subprocess.run(["sqlite3", str(db_path)], input=scc.stdout, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f" [!] SCC/SQLite error: {e}")
        return False
    return True


def run_analysis(input_dir, db_path):
    """Clone every GitHub repo linked from the markdown files and load scc stats.

    Each ``*.md`` file in *input_dir* is treated as one category named after
    the file stem; files whose stem starts with ``Top-100-`` are skipped. For
    each linked repository the function shallow-clones it, runs scc (limited
    to the category's extensions when the category is in
    ``LANGUAGE_EXTENSIONS``), feeds the generated SQL to ``sqlite3`` and then
    deletes the clone.

    Args:
        input_dir: Directory containing the markdown files.
        db_path: Location of the SQLite database (``str`` or ``Path``). When
            the file does not exist yet, the first successful import uses
            scc's ``sql`` format (schema + inserts); all later imports use
            ``sql-insert`` (data only).

    Returns:
        None. Progress and per-repository failures are printed; a failing
        clone or import does not stop the run.
    """
    # Resolve up front so the existence check and the sqlite3 call refer to
    # the same file regardless of any working directory used below.
    db_path = Path(db_path).resolve()
    first_run = not db_path.exists()

    # Clone into a private scratch directory rather than directly under /tmp:
    # a repository that shares its name with an unrelated /tmp entry must not
    # cause that entry to be deleted.
    work_dir = Path(tempfile.mkdtemp(prefix="scc-analysis-"))
    try:
        # Process files alphabetically
        for md_file in sorted(Path(input_dir).glob("*.md")):
            lang_name = md_file.stem
            # Skip top-100 files as they are mixed language
            if lang_name.startswith("Top-100-"):
                continue

            extensions = LANGUAGE_EXTENSIONS.get(lang_name)
            if extensions is not None:
                include_args = ["-i", extensions]
                print(f"\n>>> Category: {lang_name} (Filter: {extensions})")
            else:
                include_args = []
                print(f"\n>>> Category: {lang_name} (No filter)")

            for url in extract_links(md_file):
                # repo_name becomes the clone folder and the 'Project' name in SQL
                repo_name = _repo_name_from_url(url)
                if repo_name in ("", ".", ".."):
                    # These would resolve to the scratch directory or its
                    # parent instead of a fresh clone folder.
                    print(f" [!] Skipping unusable repository name from {url}")
                    continue
                repo_path = work_dir / repo_name
                print(f" [+] Analyzing: {repo_name}")
                try:
                    # 1. Clone
                    if not _clone_repo(url, repo_path):
                        print(f" [!] Failed to clone {url}")
                        continue
                    # 2. SCC to SQLite
                    # We toggle between 'sql' (schema + insert) and
                    # 'sql-insert' (just data)
                    scc_format = "sql" if first_run else "sql-insert"
                    if _scc_to_sqlite(repo_name, work_dir, db_path,
                                      scc_format, include_args):
                        first_run = False
                finally:
                    # 3. Cleanup (best effort; also removes partial clones)
                    shutil.rmtree(repo_path, ignore_errors=True)
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)
if __name__ == "__main__":
    # Script entry point: expects the markdown directory as the first argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python process_repos.py <directory_path>")
        sys.exit(1)

    # The database is created in (or reused from) the directory the script
    # is invoked from.
    output_db = Path.cwd().resolve() / "code.db"

    source_dir = cli_args[0]
    if not Path(source_dir).is_dir():
        print(f"Error: {source_dir} is not a valid directory.")
        sys.exit(1)

    print("Starting Analysis...")
    print(f"Output Database: {output_db}")
    run_analysis(source_dir, output_db)
    print("\nProcess Complete.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment