Commit 2fbf05c6 authored by Gregor Zimmermann's avatar Gregor Zimmermann

Final Push

parent e3f95e9e
import pandas as pd
import threading
import time
# Single-permit semaphore serialising appends to the shared compare CSV
# from the worker threads spawned in loaddistributor().
file_usage = threading.Semaphore(value=1)
# NOTE(review): all of these are machine-specific absolute Windows paths —
# presumably inputs/outputs of the grammar-vectorisation pipeline; consider
# making them configurable (argparse/env) instead of hard-coding.
ter_par = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\msr_aeg\\Assignment03\\Antlr_Parr.csv'
grammars = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\msr_aeg\\Assignment03\\raw_vec.csv'
vec_grammar = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Vectorized_Grammars.csv'
par_vec_grammar = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Antlr_Parr.csv'
# Output file that compare() workers append to (guarded by file_usage).
path = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\compare.csv'
def compare_by_total_hash(file_1, file_2):
    """Inner-join two vectorized-grammar CSVs on their 'CompleteHash' column
    and dump the matches to compare_by_total_hash.csv.

    file_1, file_2: paths to CSVs that each carry at least the columns
    'Date', 'Repo', 'FileName' and 'CompleteHash'.
    """
    # 'Date' was listed twice in the original usecols; once is enough.
    cols = ['Date', 'Repo', 'FileName', 'CompleteHash']
    content_1 = pd.read_csv(file_1, usecols=cols)
    content_2 = pd.read_csv(file_2, usecols=cols)
    # The join keeps one 'CompleteHash' key column and suffixes the other
    # overlapping columns — that is why more than the 4 requested columns
    # appear in the output file.
    data_new = content_1.join(content_2.set_index('CompleteHash'),
                              on='CompleteHash',
                              lsuffix='Parr', rsuffix='Grammar',
                              how='inner')
    data_new.to_csv('C:\\Users\\Admin\\Documents\\Studium\\MSR\\msr_aeg\\Assignment03\\compare_by_total_hash.csv')
    print(data_new)

# Module-level inputs: compare() scans parr_vec, loaddistributor() walks
# timed_vec — both must exist before those functions run.
timed_vec = pd.read_csv(vec_grammar)
parr_vec = pd.read_csv(par_vec_grammar)
def compare(df_1dim):
    """Find the Parr grammar whose RuleHash is closest to this grammar row's
    RuleHash and append the pairing to the shared compare CSV.

    df_1dim: one row (pandas Series) of timed_vec; must carry 'Name',
    'Type', 'TokenVocab', 'RuleCount', 'RuleHash', 'Repo', 'FileName'
    and 'Date'. Runs on a worker thread; the CSV append is serialised
    through the module-level file_usage semaphore.
    """
    target = df_1dim['RuleHash']
    # Seed with the first row. The original used parr_vec.loc[1], which
    # skipped row 0 as the seed and raises KeyError on a non-default index.
    closest_dir = parr_vec.iloc[0]
    best = abs(target - closest_dir['RuleHash'])
    for _, line in parr_vec.iterrows():
        diff = abs(target - line['RuleHash'])
        if diff <= best:  # '<=' keeps the original last-tie-wins behaviour
            closest_dir = line
            best = diff
    df = pd.DataFrame({'Name': [df_1dim['Name']],
                       'Type': [df_1dim['Type']],
                       'TokenVocab': [df_1dim['TokenVocab']],
                       'RuleCount': [df_1dim['RuleCount']],
                       'RuleHash': [df_1dim['RuleHash']],
                       # BUG FIX: this column previously copied
                       # df_1dim['RuleHash'] instead of the repo name.
                       'Repo': [df_1dim['Repo']],
                       'FileName': [df_1dim['FileName']],
                       'Date': [df_1dim['Date']],
                       'ParrGramName': [closest_dir['Name']],
                       'Diff': [best],
                       'ParrGramHash': [closest_dir['RuleHash']],
                       'ParrGramRuleCount': [closest_dir['RuleCount']]})
    # Context manager guarantees the semaphore is released even if
    # to_csv raises (the bare acquire/release pair did not).
    with file_usage:
        df.to_csv(path, mode='a', header=False)
def loaddistributor():
    """Write the compare.csv header row, spawn one compare() worker thread
    per row of timed_vec (throttled by a short sleep), and wait for all
    workers to finish before returning.

    Joining the workers is a fix: the original returned immediately, so the
    subsequent compare_by_total_hash() call could read a half-written CSV.
    """
    header = pd.DataFrame({'Name': [], 'Type': [], 'TokenVocab': [],
                           'RuleCount': [], 'RuleHash': [], 'Repo': [],
                           'FileName': [], 'Date': [], 'ParrGramName': [],
                           'Diff': [], 'ParrGramHash': [],
                           'ParrGramRuleCount': []})
    header.to_csv(path, mode='w')
    workers = []
    # enumerate replaces the hand-rolled x counter (printed 0,1,2,... as before).
    for x, (_, line) in enumerate(timed_vec.iterrows()):
        try:
            working_thread = threading.Thread(target=compare, args=[line])
            working_thread.start()
            workers.append(working_thread)
            print(x)
            time.sleep(0.2)  # throttle thread creation
        except Exception as err:
            # Was a bare 'except:' that hid the cause; report it instead.
            print('Thread couldnt be started', err)
    for worker in workers:
        worker.join()
# Build compare.csv: one worker thread per grammar row in timed_vec.
loaddistributor()
# (diff artifact from the scraped commit page: "\ No newline at end of file")
# Cross-reference the two vectorized-grammar CSVs by their CompleteHash.
compare_by_total_hash(grammars, ter_par)
......@@ -12,9 +12,9 @@ def prep_func(raw_file, path):
accumulated_vec = vectorize(raw_str)
if accumulated_vec['Name'][0] != '':
file_path = raw_file.split('\\')
accumulated_vec['Repo']=file_path[-2]
accumulated_vec['FileName']=file_path[-1].split('-')[0]
accumulated_vec['Date']=file_path[-1].split('-')[-1][:-3]
accumulated_vec.insert(0,'Repo',file_path[-2])
accumulated_vec.insert(0,'FileName', file_path[-1].split('-')[0])
accumulated_vec.insert(0,'Date',file_path[-1].split('-')[-1][:-3])
file_usage.acquire()
accumulated_vec.to_csv(path, mode='a',header=False)
file_usage.release()
......@@ -40,8 +40,9 @@ def vectorize(data):
name = ''
teip =''
toVo=''
rule_hash = 0
hash_comp = 0
rule_count = 0
rules = pd.DataFrame()
for line in data.splitlines():
if 'tokenVocab' in line:
......@@ -52,19 +53,17 @@ def vectorize(data):
teip=x[0]
else:
rule_count += 1
rule_hash += hash(line)
rules['rule-'+str(rule_count)]=[hash(line)]
hash_comp +=hash(line)
if rule_count!=0:
norm_hash = rule_hash / rule_count
else:
norm_hash=0
df= pd.DataFrame({'Name':[name], 'Type':[teip], 'TokenVocab':[toVo], 'RuleCount':[rule_count], 'RuleHash':[norm_hash]})
df = pd.DataFrame({'Name':[name], 'Type':[teip], 'TokenVocab':[toVo], 'RuleCount':[rule_count],'CompleteHash':[hash_comp]})
df = pd.concat([df,rules],axis=1)
return df
def loaddistributor(filepath, aim_dir):
'C:\\Users\Admin\\Documents\\Studium\MSR\\test.csv'
df = pd.DataFrame({'Name':[], 'Type':[], 'TokenVocab':[], 'RuleCount':[], 'RuleHash':[],'Repo':[],'FileName':[],'Date':[]})
df = pd.DataFrame({'Date':[],'FileName':[],'Repo':[],'Name':[],'Type':[],'TokenVocab':[],'RuleCount':[],'CompleteHash':[]})
df.to_csv(aim_dir, mode='w')
for root, dirs, files in os.walk(filepath):
for file in files:
......@@ -75,4 +74,4 @@ def loaddistributor(filepath, aim_dir):
except:
print('Thread couldnt be started', os.path.join(root, file))
loaddistributor('C:\\Users\Admin\\Documents\\Studium\MSR\\datadump\\1\\','C:\\Users\Admin\\Documents\\Studium\MSR\\Antlr_Parr.csv')
\ No newline at end of file
loaddistributor('C:\\Users\Admin\\Documents\\Studium\MSR\\Versioning\\','C:\\Users\Admin\\Documents\\Studium\MSR\\raw_vec.csv')
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment