Skip to content

Instantly share code, notes, and snippets.

@josh-hernandez-exe
Created April 30, 2021 09:29
Show Gist options
  • Select an option

  • Save josh-hernandez-exe/556ad97019e95f70f9baaf4e810091e4 to your computer and use it in GitHub Desktop.

Select an option

Save josh-hernandez-exe/556ad97019e95f70f9baaf4e810091e4 to your computer and use it in GitHub Desktop.
SciPy based Spare Matrix Implementation of PageRank
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scipy as sp\n",
"import scipy.sparse\n",
"import scipy.sparse.linalg\n",
"\n",
"from IPython.display import display\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def random_graph(n, density):\n",
" return sp.sparse.random(\n",
" n,n,\n",
" data_rvs=lambda x: np.ones(x),\n",
" dtype=np.int8,\n",
" density=density,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def is_strongly_connected(graph, initial_node = 0):\n",
" \"\"\"\n",
" Interpert matrix as an adjacency matrix\n",
" https://en.wikipedia.org/wiki/Strongly_connected_component\n",
" \"\"\"\n",
" row, col = graph.shape\n",
"\n",
" if row != col:\n",
" raise Exception(\"Matrix is not square\")\n",
" \n",
" is_visited = np.zeros(row, dtype=bool)\n",
" \n",
" # more efficient row operations\n",
" _graph = graph.tocsr()\n",
" stack = [initial_node]\n",
" \n",
" is_first = True\n",
"\n",
" while stack:\n",
" cur_idx = stack.pop()\n",
"\n",
" if is_visited[cur_idx]:\n",
" continue\n",
"\n",
" is_visited[cur_idx] = True\n",
" \n",
" if is_first:\n",
" is_first = False\n",
" # we want to see if we can arrive back to ourselves\n",
" # this is to make sure that the first node so happens\n",
" # to only have outgoing edges\n",
" is_visited[cur_idx] = True\n",
"\n",
" # specifically use row slices as csr matries are\n",
" # better for that\n",
" _, outound_to_vist = _graph[cur_idx,:].nonzero()\n",
"\n",
" stack.extend(outound_to_vist)\n",
"\n",
" return np.all(is_visited)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def random_nice_graph(n, density=0.1, limit=1000):\n",
" graph = None\n",
"\n",
" for _ in range(limit):\n",
" graph = random_graph(n, density)\n",
"\n",
" # remove loops\n",
" graph -= sp.sparse.diags(graph.diagonal())\n",
"\n",
" # check for strongly connected and that all row sums are positive\n",
" if is_strongly_connected(graph) and np.all(np.sum(graph, axis=1) > 0):\n",
" break\n",
" else:\n",
" raise Exception(\"Iteration limit reached!\")\n",
"\n",
" return graph"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def eigenvector_centrality(graph):\n",
" # we need the matrix as a float type\n",
" # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.asfptype.html#scipy-sparse-coo-matrix-asfptype\n",
" e_vals, e_vects = sp.sparse.linalg.eigs(graph.asfptype())\n",
" e_val_max_idx = np.argmax(np.abs(e_vals))\n",
"\n",
" # Note Perron–Frobenius theorem\n",
" # we apply the case for non-negative as adjacency matrices are non negative\n",
" # https://en.wikipedia.org/wiki/Perron%E2%80%93Frobenius_theorem#Non-negative_matrices_2 \n",
"\n",
" if not np.isclose(e_vals[e_val_max_idx].imag, 0):\n",
" raise Error(\"Largest Eigenvalue is not real.\")\n",
" \n",
" largest_e_val = e_vals[e_val_max_idx].real\n",
" largest_e_vec = e_vects[:, e_val_max_idx].real\n",
" \n",
" if largest_e_vec[0] < 0:\n",
" largest_e_vec = -largest_e_vec\n",
" \n",
" largest_e_vec /= np.linalg.norm(largest_e_vec)\n",
"\n",
" return largest_e_val, largest_e_vec"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def normalized_adjacency_matrix(graph, fix_sinks=False):\n",
" rows, cols = graph.shape\n",
" one_vec = np.ones([rows,1])\n",
" out_degree = graph @ one_vec\n",
" non_zero = ~np.isclose(out_degree, 0)\n",
"\n",
" # any rows of zero will be divided by one\n",
" out_degree_inv = np.ones([rows,1])\n",
" out_degree_inv[non_zero] = 1/out_degree[non_zero]\n",
"\n",
" out_degree_mat = sp.sparse.diags(out_degree_inv.flatten())\n",
"\n",
" N = out_degree_mat @ graph\n",
"\n",
" if fix_sinks:\n",
" N = N.tocsr()\n",
" \n",
" for ii, [is_zero] in enumerate(~non_zero):\n",
" if is_zero:\n",
" N[ii,:] = np.ones([rows,1]) / rows\n",
"\n",
" return N"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def normlized_eigenvector_centrality(graph, fix_sinks=True):\n",
" N = normalized_adjacency_matrix(graph, fix_sinks=fix_sinks)\n",
"\n",
" _, x = eigenvector_centrality(N.T)\n",
"\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def page_rank(graph, alpha = 0.85, attention_distribution=None, initial_vec=None, tolerance=10e-10):\n",
" \"\"\"\n",
" \"\"\"\n",
" if not (0 <= alpha <= 1):\n",
" raise Exception()\n",
"\n",
" rows, cols = graph.shape\n",
"\n",
" v = None\n",
"\n",
" if attention_distribution is not None:\n",
" v = attention_distribution\n",
" else:\n",
" # uniform distribution\n",
" v = np.ones([rows, 1]) / rows\n",
" \n",
" \n",
" one_vec = np.ones([rows,1])\n",
" \n",
" N = normalized_adjacency_matrix(graph)\n",
" P = N.T\n",
" \n",
" curr_vec = None\n",
" prev_vec = None\n",
"\n",
" if initial_vec is not None:\n",
" curr_vec = initial_vec\n",
" else:\n",
" # litrature says this is a good default\n",
" curr_vec = v\n",
" \n",
" def is_done(xx,yy):\n",
" if xx is None: return False\n",
" if yy is None: return False\n",
" \n",
" return np.all(np.abs(xx-yy) < tolerance)\n",
"\n",
" # normalize values to sum to 1 before we start\n",
" curr_vec /= np.sum(curr_vec)\n",
" \n",
" while not is_done(curr_vec, prev_vec):\n",
" prev_vec = curr_vec\n",
" curr_vec = alpha * P @ curr_vec + (1-alpha) * v\n",
" \n",
" # normalize values to sum to 1\n",
" curr_vec /= np.sum(curr_vec)\n",
"\n",
" return curr_vec"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"graph_1 = random_nice_graph(n=25, density=0.09)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 18.2 ms, sys: 9.63 ms, total: 27.9 ms\n",
"Wall time: 63.3 ms\n"
]
}
],
"source": [
"%%time\n",
"a_1, b_1 = eigenvector_centrality(graph_1.T)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 54.1 ms, sys: 9.93 ms, total: 64.1 ms\n",
"Wall time: 47 ms\n"
]
}
],
"source": [
"%%time\n",
"c_1 = normlized_eigenvector_centrality(graph_1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 16.2 ms, sys: 99 µs, total: 16.3 ms\n",
"Wall time: 14.1 ms\n"
]
}
],
"source": [
"%%time\n",
"x_1 = page_rank(graph_1, alpha=0.85)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"in_degree_1 = np.array(np.sum(graph_1, axis=0)).flatten()\n",
"out_degree_1 = np.array(np.sum(graph_1, axis=1)).flatten()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"in_degree[3] = 4.0\n"
]
}
],
"source": [
"max_in_degree_idx = np.argmax(in_degree_1)\n",
"print(f\"in_degree[{max_in_degree_idx}] = {in_degree_1[max_in_degree_idx]}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"out_degree[2] = 6.0\n"
]
}
],
"source": [
"max_out_degree_idx = np.argmax(out_degree_1)\n",
"print(f\"out_degree[{max_out_degree_idx}] = {out_degree_1[max_out_degree_idx]}\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"transpose_e_vector_centrality[17] = 0.30777879969217914\n",
"in_degree[17] = 3.0\n"
]
}
],
"source": [
"max_t_e_vec_cent_idx = np.argmax(b_1)\n",
"print(f\"transpose_e_vector_centrality[{max_t_e_vec_cent_idx}] = {b_1[max_t_e_vec_cent_idx]}\")\n",
"print(f\"in_degree[{max_t_e_vec_cent_idx}] = {in_degree_1[max_t_e_vec_cent_idx]}\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"normalized_e_vector_centrality[16] = 0.372656592018923\n",
"in_degree[16] = 4.0\n"
]
}
],
"source": [
"max_e_vec_normed_cent_idx = np.argmax(c_1)\n",
"print(f\"normalized_e_vector_centrality[{max_e_vec_normed_cent_idx}] = {c_1[max_e_vec_normed_cent_idx]}\")\n",
"print(f\"in_degree[{max_e_vec_normed_cent_idx}] = {in_degree_1[max_e_vec_normed_cent_idx]}\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_rank[16] = [0.08246351]\n",
"in_degree[16] = 4.0\n"
]
}
],
"source": [
"max_page_rank_idx = np.argmax(x_1)\n",
"print(f\"page_rank[{max_page_rank_idx}] = {x_1[max_page_rank_idx]}\")\n",
"print(f\"in_degree[{max_page_rank_idx}] = {in_degree_1[max_page_rank_idx]}\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def plot_graph(nx_graph, node_positions, score, label, cmap=plt.cm.winter):\n",
"\n",
" fig, ax = plt.subplots(dpi=600, figsize=(12,9))\n",
"\n",
" vmin=np.min(score)\n",
" vmax=np.max(score)\n",
"\n",
" # https://networkx.org/documentation/stable/reference/drawing.html\n",
" nx.draw_networkx(nx_graph,\n",
" pos=node_positions,\n",
" node_color=score,\n",
" vmin=vmin, vmax=vmax,\n",
" cmap=plt.cm.winter, ax=ax,\n",
" )\n",
"\n",
" sm = plt.cm.ScalarMappable(cmap=plt.cm.winter, norm=plt.Normalize(vmin=vmin, vmax=vmax))\n",
" cbar = plt.colorbar(sm)\n",
" cbar.ax.get_yaxis().labelpad = 15\n",
" cbar.ax.set_ylabel(label,rotation=270)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"nx_graph_1 = nx.DiGraph()\n",
"\n",
"for ii in range(graph_1.shape[0]):\n",
" nx_graph_1.add_node(ii)\n",
"\n",
"for (ii,jj) in zip(*graph_1.nonzero()):\n",
" # https://networkx.org/documentation/stable/reference/classes/generated/networkx.DiGraph.add_edge.html\n",
" nx_graph_1.add_edge(ii,jj) # src ii to dst jj"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"node_positions = nx.spring_layout(nx_graph_1)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment