boyter · February 4, 2026 05:50
diff --git a/process_repos.py b/process_repos.py
 import os
 import sys
 import re
 import subprocess
 import shutil
 from pathlib import Path

 # Maps a language name (the stem of that language's markdown file) to the
 # comma-separated extension list passed to ``scc -i``, so that only files of
 # that language are counted. Names without an entry are analysed unfiltered.
 LANGUAGE_EXTENSIONS = {
    "ActionScript": "as",
    "Clojure": "clj,cljc",
    "C": "c,ec,pgc",
    "CoffeeScript": "coffee",
    "CPP": "cc,cpp,cxx,c++,pcc,ino,ccm,cppm,cxxm,c++m,mxx",
    "CSharp": "cs,csx",
    "CSS": "css",
    "Dart": "dart",
    "DM": "dm",
    "Elixir": "ex,exs",
    "Go": "go",
    "Groovy": "groovy,grt,gtpl,gvy",
    "Haskell": "hs",
    "HTML": "html,htm",
    "Java": "java",
    "JavaScript": "js,cjs,mjs",
    "Julia": "jl",
    "Kotlin": "kt,kts",
    "Lua": "lua",
    # NOTE: MATLAB and Objective-C share the "m" extension, so an extension
    # filter alone cannot tell the two apart.
    "MATLAB": "m",
    "Objective-C": "m",
    "Perl": "pl,plx,pm",
    "PHP": "php",
    "PowerShell": "ps1,psm1",
    "Python": "py,pyw,pyi",
    "R": "r",
    "Ruby": "rb",
    "Rust": "rs",
    "Scala": "sc,scala",
    "Shell": "sh,.tcshrc",
    "Swift": "swift",
    "TeX": "tex,sty",
    "TypeScript": "ts,tsx",
    # Each extension is listed once (the list previously repeated
    # "vimrc,gvimrc" at the end).
    "Vim-script": "vim,vimrc,gvimrc,_vimrc,.vimrc,_gvimrc,.gvimrc",
 }

 def extract_links(filepath):
    """Collect the unique GitHub repository URLs found in a markdown file.

    Every match of ``https://github.com/<owner>/<repo>`` is normalised so that
    it ends in ``.git``. Duplicates are dropped and first-seen order is kept.

    Args:
        filepath: Location of the markdown file, as ``str`` or ``pathlib.Path``.

    Returns:
        A list of clone URLs. If the file cannot be opened or decoded, the
        error is printed and the links gathered up to that point are returned
        (an empty list when nothing could be read).
    """
    github_regex = re.compile(r'https://github\.com/[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+')
    # A dict keeps insertion order and gives O(1) duplicate checks, unlike
    # scanning a list for every match.
    urls = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                for m in github_regex.findall(line):
                    link = m if m.endswith('.git') else f"{m}.git"
                    urls.setdefault(link, None)
    except (OSError, UnicodeError) as e:
        # Path(...) lets the message work for plain string paths too; using
        # filepath.name directly raised AttributeError for str arguments.
        print(f"    [!] Error reading {Path(filepath).name}: {e}")
    return list(urls)

 def _repo_name_from_url(url):
    """Return the last path segment of *url* without a trailing ``.git``."""
    name = url.split('/')[-1]
    # Strip only the suffix: str.replace('.git', '') would also mangle names
    # such as "user.github.io" into "userhub.io".
    if name.endswith('.git'):
        name = name[:-len('.git')]
    return name


 def _scc_into_sqlite(repo_name, include_args, scc_format, workdir, db_path):
    """Run scc on *repo_name* inside *workdir* and load its SQL into *db_path*.

    Raises:
        subprocess.CalledProcessError: scc or sqlite3 exited non-zero.
        OSError: scc or sqlite3 could not be started.
    """
    # Two explicit processes instead of a shell pipeline: a shell pipe only
    # reports the exit status of its last command, so an scc failure went
    # unnoticed, and argument lists need no quoting of paths.
    scc = subprocess.run(
        ["scc", "-u", "-a", "--format", scc_format, *include_args, repo_name],
        cwd=str(workdir),
        stdout=subprocess.PIPE,
        check=True,
    )
    subprocess.run(["sqlite3", str(db_path)], input=scc.stdout, check=True)


 def run_analysis(input_dir, db_path):
    """Clone every repository listed in the markdown files and record scc stats.

    For each ``*.md`` file in *input_dir* (alphabetical order, files whose
    name starts with ``Top-100-`` skipped) the GitHub links are extracted,
    each repository is shallow-cloned into a private temporary workspace,
    analysed with ``scc`` and the resulting SQL is loaded into the SQLite
    database with the ``sqlite3`` command-line tool. Failures for a single
    repository are printed and the run continues with the next one.

    Args:
        input_dir: Directory containing the per-language markdown files.
        db_path: Location of the SQLite database (``str`` or ``Path``). A
            relative path is resolved against the current working directory.

    Returns:
        None.
    """
    import tempfile  # local import: only this function needs it

    db_path = Path(db_path)
    # scc's 'sql' format emits schema + inserts, 'sql-insert' just the data;
    # the schema is requested until one load has fully succeeded.
    first_run = not db_path.exists()
    search_path = Path(input_dir)

    # A private workspace avoids deleting unrelated directories that happen
    # to share a repository's name directly under /tmp.
    with tempfile.TemporaryDirectory(prefix="process_repos_") as workspace:
        tmp_root = Path(workspace)

        # Process files alphabetically
        for md_file in sorted(search_path.glob("*.md")):
            lang_name = md_file.stem

            # Skip top-100 files as they are mixed language
            if lang_name.startswith("Top-100-"):
                continue

            if lang_name in LANGUAGE_EXTENSIONS:
                include_args = ["-i", LANGUAGE_EXTENSIONS[lang_name]]
                print(f"\n>>> Category: {lang_name} (Filter: {LANGUAGE_EXTENSIONS[lang_name]})")
            else:
                include_args = []
                print(f"\n>>> Category: {lang_name} (No filter)")

            for url in extract_links(md_file):
                # repo_name is the clone folder and the name scc is given
                repo_name = _repo_name_from_url(url)
                if repo_name in ("", ".", ".."):
                    # These would resolve to the workspace itself or its
                    # parent, which the cleanup below would then delete.
                    print(f"        [!] Skipping unsafe repository name from {url}")
                    continue
                repo_path = tmp_root / repo_name

                print(f"    [+] Analyzing: {repo_name}")

                # 1. Clone (after removing any stale leftover of that name)
                shutil.rmtree(repo_path, ignore_errors=True)
                try:
                    subprocess.run(
                        ["git", "clone", "--depth", "1", url, str(repo_path)],
                        check=True, capture_output=True
                    )
                except subprocess.CalledProcessError:
                    print(f"        [!] Failed to clone {url}")
                    continue

                # 2. SCC to SQLite, run from the workspace so scc is handed
                # the bare repo_name rather than a full path.
                scc_format = "sql" if first_run else "sql-insert"
                try:
                    _scc_into_sqlite(
                        repo_name, include_args, scc_format, tmp_root, db_path
                    )
                    first_run = False
                except (subprocess.CalledProcessError, OSError) as e:
                    print(f"        [!] SCC/SQLite error: {e}")
                finally:
                    # 3. Cleanup is best-effort; anything left behind goes
                    # away with the workspace.
                    shutil.rmtree(repo_path, ignore_errors=True)

 if __name__ == "__main__":
    # One positional argument is required: the directory of markdown files.
    if not sys.argv[1:]:
        print("Usage: python process_repos.py <directory_path>")
        raise SystemExit(1)

    # The database is created in the directory the script was launched from.
    db_file = Path.cwd().resolve().joinpath("code.db")
    markdown_dir = sys.argv[1]

    if not Path(markdown_dir).is_dir():
        print(f"Error: {markdown_dir} is not a valid directory.")
        raise SystemExit(1)

    print("Starting Analysis...")
    print(f"Output Database: {db_file}")

    run_analysis(markdown_dir, db_file)

    print("\nProcess Complete.")
	import os
	import sys
	import re
	import subprocess
	import shutil
	from pathlib import Path

	# Maps a language name (the stem of that language's markdown file) to the
	# comma-separated extension list passed to ``scc -i``, so that only files of
	# that language are counted. Names without an entry are analysed unfiltered.
	LANGUAGE_EXTENSIONS = {
	    "ActionScript": "as",
	    "Clojure": "clj,cljc",
	    "C": "c,ec,pgc",
	    "CoffeeScript": "coffee",
	    "CPP": "cc,cpp,cxx,c++,pcc,ino,ccm,cppm,cxxm,c++m,mxx",
	    "CSharp": "cs,csx",
	    "CSS": "css",
	    "Dart": "dart",
	    "DM": "dm",
	    "Elixir": "ex,exs",
	    "Go": "go",
	    "Groovy": "groovy,grt,gtpl,gvy",
	    "Haskell": "hs",
	    "HTML": "html,htm",
	    "Java": "java",
	    "JavaScript": "js,cjs,mjs",
	    "Julia": "jl",
	    "Kotlin": "kt,kts",
	    "Lua": "lua",
	    # NOTE: MATLAB and Objective-C share the "m" extension, so an extension
	    # filter alone cannot tell the two apart.
	    "MATLAB": "m",
	    "Objective-C": "m",
	    "Perl": "pl,plx,pm",
	    "PHP": "php",
	    "PowerShell": "ps1,psm1",
	    "Python": "py,pyw,pyi",
	    "R": "r",
	    "Ruby": "rb",
	    "Rust": "rs",
	    "Scala": "sc,scala",
	    "Shell": "sh,.tcshrc",
	    "Swift": "swift",
	    "TeX": "tex,sty",
	    "TypeScript": "ts,tsx",
	    # Each extension is listed once (the list previously repeated
	    # "vimrc,gvimrc" at the end).
	    "Vim-script": "vim,vimrc,gvimrc,_vimrc,.vimrc,_gvimrc,.gvimrc",
	}

	def extract_links(filepath):
	    """Collect the unique GitHub repository URLs found in a markdown file.

	    Every match of ``https://github.com/<owner>/<repo>`` is normalised so that
	    it ends in ``.git``. Duplicates are dropped and first-seen order is kept.

	    Args:
	        filepath: Location of the markdown file, as ``str`` or ``pathlib.Path``.

	    Returns:
	        A list of clone URLs. If the file cannot be opened or decoded, the
	        error is printed and the links gathered up to that point are returned
	        (an empty list when nothing could be read).
	    """
	    github_regex = re.compile(r'https://github\.com/[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+')
	    # A dict keeps insertion order and gives O(1) duplicate checks, unlike
	    # scanning a list for every match.
	    urls = {}
	    try:
	        with open(filepath, 'r', encoding='utf-8') as f:
	            for line in f:
	                for m in github_regex.findall(line):
	                    link = m if m.endswith('.git') else f"{m}.git"
	                    urls.setdefault(link, None)
	    except (OSError, UnicodeError) as e:
	        # Path(...) lets the message work for plain string paths too; using
	        # filepath.name directly raised AttributeError for str arguments.
	        print(f" [!] Error reading {Path(filepath).name}: {e}")
	    return list(urls)

	def _repo_name_from_url(url):
	    """Return the last path segment of *url* without a trailing ``.git``."""
	    name = url.split('/')[-1]
	    # Strip only the suffix: str.replace('.git', '') would also mangle names
	    # such as "user.github.io" into "userhub.io".
	    if name.endswith('.git'):
	        name = name[:-len('.git')]
	    return name


	def _scc_into_sqlite(repo_name, include_args, scc_format, workdir, db_path):
	    """Run scc on *repo_name* inside *workdir* and load its SQL into *db_path*.

	    Raises:
	        subprocess.CalledProcessError: scc or sqlite3 exited non-zero.
	        OSError: scc or sqlite3 could not be started.
	    """
	    # Two explicit processes instead of a shell pipeline: a shell pipe only
	    # reports the exit status of its last command, so an scc failure went
	    # unnoticed, and argument lists need no quoting (the escaped "\|" in the
	    # old command string was passed to scc as a literal argument).
	    scc = subprocess.run(
	        ["scc", "-u", "-a", "--format", scc_format, *include_args, repo_name],
	        cwd=str(workdir),
	        stdout=subprocess.PIPE,
	        check=True,
	    )
	    subprocess.run(["sqlite3", str(db_path)], input=scc.stdout, check=True)


	def run_analysis(input_dir, db_path):
	    """Clone every repository listed in the markdown files and record scc stats.

	    For each ``*.md`` file in *input_dir* (alphabetical order, files whose
	    name starts with ``Top-100-`` skipped) the GitHub links are extracted,
	    each repository is shallow-cloned into a private temporary workspace,
	    analysed with ``scc`` and the resulting SQL is loaded into the SQLite
	    database with the ``sqlite3`` command-line tool. Failures for a single
	    repository are printed and the run continues with the next one.

	    Args:
	        input_dir: Directory containing the per-language markdown files.
	        db_path: Location of the SQLite database (``str`` or ``Path``). A
	            relative path is resolved against the current working directory.

	    Returns:
	        None.
	    """
	    import tempfile  # local import: only this function needs it

	    db_path = Path(db_path)
	    # scc's 'sql' format emits schema + inserts, 'sql-insert' just the data;
	    # the schema is requested until one load has fully succeeded.
	    first_run = not db_path.exists()
	    search_path = Path(input_dir)

	    # A private workspace avoids deleting unrelated directories that happen
	    # to share a repository's name directly under /tmp.
	    with tempfile.TemporaryDirectory(prefix="process_repos_") as workspace:
	        tmp_root = Path(workspace)

	        # Process files alphabetically
	        for md_file in sorted(search_path.glob("*.md")):
	            lang_name = md_file.stem

	            # Skip top-100 files as they are mixed language
	            if lang_name.startswith("Top-100-"):
	                continue

	            if lang_name in LANGUAGE_EXTENSIONS:
	                include_args = ["-i", LANGUAGE_EXTENSIONS[lang_name]]
	                print(f"\n>>> Category: {lang_name} (Filter: {LANGUAGE_EXTENSIONS[lang_name]})")
	            else:
	                include_args = []
	                print(f"\n>>> Category: {lang_name} (No filter)")

	            for url in extract_links(md_file):
	                # repo_name is the clone folder and the name scc is given
	                repo_name = _repo_name_from_url(url)
	                if repo_name in ("", ".", ".."):
	                    # These would resolve to the workspace itself or its
	                    # parent, which the cleanup below would then delete.
	                    print(f" [!] Skipping unsafe repository name from {url}")
	                    continue
	                repo_path = tmp_root / repo_name

	                print(f" [+] Analyzing: {repo_name}")

	                # 1. Clone (after removing any stale leftover of that name)
	                shutil.rmtree(repo_path, ignore_errors=True)
	                try:
	                    subprocess.run(
	                        ["git", "clone", "--depth", "1", url, str(repo_path)],
	                        check=True, capture_output=True
	                    )
	                except subprocess.CalledProcessError:
	                    print(f" [!] Failed to clone {url}")
	                    continue

	                # 2. SCC to SQLite, run from the workspace so scc is handed
	                # the bare repo_name rather than a full path.
	                scc_format = "sql" if first_run else "sql-insert"
	                try:
	                    _scc_into_sqlite(
	                        repo_name, include_args, scc_format, tmp_root, db_path
	                    )
	                    first_run = False
	                except (subprocess.CalledProcessError, OSError) as e:
	                    print(f" [!] SCC/SQLite error: {e}")
	                finally:
	                    # 3. Cleanup is best-effort; anything left behind goes
	                    # away with the workspace.
	                    shutil.rmtree(repo_path, ignore_errors=True)

	if __name__ == "__main__":
	    # One positional argument is required: the directory of markdown files.
	    if not sys.argv[1:]:
	        print("Usage: python process_repos.py <directory_path>")
	        raise SystemExit(1)

	    # The database is created in the directory the script was launched from.
	    db_file = Path.cwd().resolve().joinpath("code.db")
	    markdown_dir = sys.argv[1]

	    if not Path(markdown_dir).is_dir():
	        print(f"Error: {markdown_dir} is not a valid directory.")
	        raise SystemExit(1)

	    print("Starting Analysis...")
	    print(f"Output Database: {db_file}")

	    run_analysis(markdown_dir, db_file)

	    print("\nProcess Complete.")
No results found