Skip to content

Instantly share code, notes, and snippets.

@boyter
Created February 4, 2026 05:50
Show Gist options
  • Select an option

  • Save boyter/47d8dcc968a03edaecda196097c3822b to your computer and use it in GitHub Desktop.

Select an option

Save boyter/47d8dcc968a03edaecda196097c3822b to your computer and use it in GitHub Desktop.
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
# Mapping table to restrict scc to specific language extensions.
#
# Keys are compared against the stem of each markdown file (e.g. "Go" for
# "Go.md"); values are comma-separated lists handed verbatim to scc's
# ``-i`` option.  A category without an entry here is analysed unfiltered.
LANGUAGE_EXTENSIONS = {
    "ActionScript": "as",
    "Clojure": "clj,cljc",
    "C": "c,ec,pgc",
    "CoffeeScript": "coffee",
    "CPP": "cc,cpp,cxx,c++,pcc,ino,ccm,cppm,cxxm,c++m,mxx",
    "CSharp": "cs,csx",
    "CSS": "css",
    "Dart": "dart",
    "DM": "dm",
    "Elixir": "ex,exs",
    "Go": "go",
    "Groovy": "groovy,grt,gtpl,gvy",
    "Haskell": "hs",
    "HTML": "html,htm",
    "Java": "java",
    "JavaScript": "js,cjs,mjs",
    "Julia": "jl",
    "Kotlin": "kt,kts",
    "Lua": "lua",
    # NOTE: MATLAB and Objective-C share the "m" extension, so either
    # category also counts files belonging to the other language.
    "MATLAB": "m",
    "Objective-C": "m",
    "Perl": "pl,plx,pm",
    "PHP": "php",
    "PowerShell": "ps1,psm1",
    "Python": "py,pyw,pyi",
    "R": "r",
    "Ruby": "rb",
    "Rust": "rs",
    "Scala": "sc,scala",
    "Shell": "sh,.tcshrc",
    "Swift": "swift",
    "TeX": "tex,sty",
    "TypeScript": "ts,tsx",
    # "vimrc" and "gvimrc" used to be listed twice; each entry now appears once.
    "Vim-script": "vim,vimrc,gvimrc,_vimrc,.vimrc,_gvimrc,.gvimrc",
}
def extract_links(filepath):
    """Extract unique GitHub repository clone URLs from a markdown file.

    Args:
        filepath: Path (``str`` or ``pathlib.Path``) of the markdown file.

    Returns:
        A list of ``https://github.com/<owner>/<repo>.git`` URLs in order of
        first appearance, without duplicates.  If the file cannot be read or
        decoded, the error is printed and the links collected so far (possibly
        none) are returned.
    """
    # The repository segment must end in a non-dot character so that a
    # sentence-ending period ("... github.com/foo/bar.") is not captured as
    # part of the repository name.
    github_regex = re.compile(
        r'https://github\.com/[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]*[a-zA-Z0-9_-]'
    )
    filepath = Path(filepath)  # accept plain strings as well as Path objects
    urls = []
    seen = set()  # O(1) duplicate check; `urls` keeps first-seen order
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                for m in github_regex.findall(line):
                    link = m if m.endswith('.git') else f"{m}.git"
                    # "<owner>/.git" has an empty repository name: skip it.
                    if link.endswith('/.git') or link in seen:
                        continue
                    seen.add(link)
                    urls.append(link)
    except (OSError, UnicodeError) as e:
        print(f" [!] Error reading {filepath.name}: {e}")
    return urls
def _repo_name_from_url(url):
    """Return the last path segment of *url* without a trailing ``.git``."""
    name = url.split('/')[-1]
    # Strip only the suffix: str.replace() would also mangle names that merely
    # contain ".git", e.g. "user.github.io.git" -> "userhub.io".
    if name.endswith('.git'):
        name = name[:-len('.git')]
    return name


def _clone_repo(url, dest):
    """Shallow-clone *url* into *dest*; return True on success."""
    shutil.rmtree(dest, ignore_errors=True)  # drop leftovers of a same-named repo
    try:
        subprocess.run(
            ["git", "clone", "--depth", "1", url, str(dest)],
            check=True, capture_output=True
        )
    except subprocess.CalledProcessError:
        return False
    return True


def _import_scc_report(repo_name, workspace, db_path, scc_format, include_args):
    """Run scc on *repo_name* inside *workspace* and load its SQL into *db_path*."""
    # scc runs from the workspace with a relative path so that the bare
    # repository name ends up as the project name in its SQL output.
    # Its output is captured completely before sqlite3 is started: a failing
    # scc run is then detected through its own exit status (a shell pipeline
    # only reports the status of its last command) and never feeds partial
    # SQL into the database.
    report = subprocess.run(
        ["scc", "-u", "-a", "--format", scc_format, *include_args, repo_name],
        cwd=workspace,
        check=True,
        stdout=subprocess.PIPE,
    )
    subprocess.run(["sqlite3", str(db_path)], input=report.stdout, check=True)


def run_analysis(input_dir, db_path):
    """Clone every repository listed in the markdown files and record scc stats.

    Each ``*.md`` file in *input_dir* is one category, processed in
    alphabetical order; files whose name starts with ``Top-100-`` are skipped
    because they mix languages.  Every GitHub link found in a file is
    shallow-cloned into a private temporary directory, analysed with ``scc``
    (restricted to the category's extensions when LANGUAGE_EXTENSIONS has an
    entry for it) and the generated SQL is loaded into the SQLite database.

    Args:
        input_dir: Directory containing the markdown files (``str`` or Path).
        db_path: SQLite database file (``str`` or Path).  If it does not
            exist yet, the first successful import also creates the schema.

    Returns:
        None.  Clone and scc/sqlite3 failures are printed and the affected
        repository is skipped.
    """
    # Resolve once so the existence check and sqlite3 agree on the same file.
    db_path = Path(db_path).resolve()
    first_run = not db_path.exists()
    search_path = Path(input_dir)

    # A private workspace instead of the shared /tmp: nothing unrelated can be
    # deleted by accident and everything is removed when the run ends.
    with tempfile.TemporaryDirectory(prefix="scc-repos-") as workspace:
        tmp_root = Path(workspace)
        # Process files alphabetically
        for md_file in sorted(search_path.glob("*.md")):
            lang_name = md_file.stem
            # Skip top-100 files as they are mixed language
            if lang_name.startswith("Top-100-"):
                continue
            extensions = LANGUAGE_EXTENSIONS.get(lang_name)
            if extensions:
                include_args = ["-i", extensions]
                print(f"\n>>> Category: {lang_name} (Filter: {extensions})")
            else:
                include_args = []
                print(f"\n>>> Category: {lang_name} (No filter)")

            for url in extract_links(md_file):
                # repo_name is both the clone folder and the project name in SQL
                repo_name = _repo_name_from_url(url)
                # These names would resolve to the workspace itself or its
                # parent, which must never be cloned into or deleted.
                if repo_name in ("", ".", ".."):
                    print(f" [!] Skipping {url}: unusable repository name")
                    continue
                repo_path = tmp_root / repo_name
                print(f" [+] Analyzing: {repo_name}")
                try:
                    # 1. Clone
                    if not _clone_repo(url, repo_path):
                        print(f" [!] Failed to clone {url}")
                        continue
                    # 2. SCC to SQLite: 'sql' emits schema + inserts,
                    # 'sql-insert' only the inserts for an existing schema.
                    scc_format = "sql" if first_run else "sql-insert"
                    try:
                        _import_scc_report(
                            repo_name, tmp_root, db_path, scc_format, include_args
                        )
                        first_run = False
                    except (subprocess.CalledProcessError, OSError) as e:
                        print(f" [!] SCC/SQLite error: {e}")
                finally:
                    # 3. Cleanup (best effort; the workspace is removed anyway)
                    shutil.rmtree(repo_path, ignore_errors=True)
if __name__ == "__main__":
    # The markdown directory is the single required positional argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python process_repos.py <directory_path>")
        sys.exit(1)

    # The database is created in the directory the script is invoked from.
    database_path = Path.cwd().resolve() / "code.db"

    source_dir = cli_args[0]
    if not Path(source_dir).is_dir():
        print(f"Error: {source_dir} is not a valid directory.")
        sys.exit(1)

    print("Starting Analysis...")
    print(f"Output Database: {database_path}")
    run_analysis(source_dir, database_path)
    print("\nProcess Complete.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment