Created
May 9, 2021 08:44
-
-
Save UweZiegenhagen/8451df524a078da118346ae2cab202d6 to your computer and use it in GitHub Desktop.
Complete Medium 20210509 code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import collections | |
| import pandas as pd | |
| file1 = pd.read_csv('file1.csv', sep=';', encoding='UTF-8') | |
| file2 = pd.read_csv('file2.csv', sep=';', encoding='UTF-8') | |
| columnnames1 = list(file1) | |
| columnnames2 = list(file2) | |
| if collections.Counter(columnnames1) == collections.Counter(columnnames2): | |
| print ("Number of columns and Names match, Comparison possible...\n\n") | |
| else: | |
| print ("Number of columns and Names are not matching!!! Please check the input!") | |
| sys.exit('Error!') | |
| # add suffixes to distinguish between actual and expected in the merger | |
| file1 = file1.add_suffix('_e') # expected | |
| file2 = file2.add_suffix('_t') # t | |
| # merge them using the given key, use outer join | |
| comparison = pd.merge(file1,file2, how='outer', | |
| left_on=['Key_e'], | |
| right_on=['Key_t']) | |
| # create the columnwise comparison | |
| for col in columnnames1: | |
| comparison[(col + '_c')] = comparison[(col + '_t')] == comparison[(col + '_e')] | |
| # reorder the columns | |
| comparison=comparison.reindex(sorted(comparison.columns),axis=1) | |
| # save the result as Excel file | |
| comparison.to_excel('result.xlsx') | |
| # names of the comparison column | |
| check_colnames= [s + '_c' for s in columnnames1] | |
| # initialize an empty dataframe for the log | |
| logdf=pd.DataFrame(index=[True,False]) | |
| for column in check_colnames: | |
| t=comparison[column].value_counts() # returns a series | |
| tt=pd.DataFrame(t) # makes a DF out of the series | |
| logdf = logdf.join(tt,how='outer') # join the two dfs | |
| # transpose for better readability | |
| logdf = logdf.transpose() | |
| # Ensure fixed sequence of the columns | |
| logdf=logdf.reindex(sorted(logdf.columns),axis=1) | |
| # write to disk | |
| logdf.to_excel('logfile.xlsx') | |
| # for better viewing on the screen | |
| logdf.fillna('-',inplace=True) | |
| pd.options.display.float_format = '{:,.0f}'.format | |
| print(logdf) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment