Commit 3b358e9a authored by Gregor Zimmermann

finalized version of the dl and prep scripts

parent a1c605f6
G Zim,DESKTOP-RVN2OU3/Admin,DESKTOP-RVN2OU3,26.03.2020 23:28,file:///C:/Users/Admin/AppData/Roaming/OpenOffice/4;
\ No newline at end of file
This diff is collapsed.
......@@ -3,7 +3,6 @@ import pandas as pd
import os
import re
import time
from random import random
file_usage = threading.Semaphore(value=1)
def prep_func(raw_file, path):
    """Vectorize one grammar file and append its row to the CSV at *path*.

    The file is passed through ``pre_clean`` and the result through
    ``vectorize``. When the resulting frame has an empty ``Name`` in its
    first row, nothing is written. Otherwise ``Repo``, ``FileName`` and
    ``Date`` columns are derived from the file path and the frame is
    appended to *path* without a header row.

    Args:
        raw_file: Path of the file to process. The last path component is
            expected to look like ``<name>-<date>.g4`` and its parent
            directory is used as the repository name.
        path: CSV file the row is appended to.
    """
    raw_str = pre_clean(raw_file)
    accumulated_vec = vectorize(raw_str)
    if accumulated_vec['Name'][0] == '':
        return

    # Split on both separators: os.path.join produces '/' on non-Windows
    # systems, where splitting on '\\' alone leaves a single component and
    # makes the [-2] lookup fail.
    file_path = re.split(r'[\\/]', raw_file)
    base_name = file_path[-1]
    accumulated_vec['Repo'] = file_path[-2]
    accumulated_vec['FileName'] = base_name.split('-')[0]
    # [:-3] drops the trailing three characters (the '.g4' suffix).
    accumulated_vec['Date'] = base_name.split('-')[-1][:-3]

    # The context manager releases the semaphore even if to_csv raises, so
    # one failed write cannot block every other worker thread.
    with file_usage:
        accumulated_vec.to_csv(path, mode='a', header=False)
#returns "naked" data without any comments or bodies
def pre_clean(file):
data = open(file,encoding="utf-8").read()
data = open(file).read()
data = re.sub(re.compile('\'.*?\''), '', data) # removes Strings they could be misleading otherwise
data = re.sub(re.compile('/\*.*?\*/', re.DOTALL), '',data) # removes (/*COMMENT */)
......@@ -37,8 +37,8 @@ def pre_clean(file):
# returns a DataFrame with file characteristics
# grammar-name, type(grammar|parser|), tokenvocab,rule-count,rule-hash
def vectorize(data):
name = '-'
teip ='-'
name = ''
teip =''
toVo=''
rule_hash = 0
rule_count = 0
......@@ -49,28 +49,30 @@ def vectorize(data):
if 'grammar' in line:
x=line.split()
name=x[-1].strip()
if len(x) ==3:
teip=x[-2].strip()
teip=x[0]
else:
rule_count += 1
rule_hash += hash(line)
norm_hash = rule_hash / rule_count
if rule_count!=0:
norm_hash = rule_hash / rule_count
else:
norm_hash=0
df= pd.DataFrame({'Name':[name], 'Type':[teip], 'TokenVocab':[toVo], 'RuleCount':[rule_count], 'RuleHash':[rule_hash]})
df= pd.DataFrame({'Name':[name], 'Type':[teip], 'TokenVocab':[toVo], 'RuleCount':[rule_count], 'RuleHash':[norm_hash]})
return df
def loaddistributor(filepath, aim_dir):
    """Walk *filepath* and start one ``prep_func`` thread per file found.

    The CSV at *aim_dir* is first overwritten with an empty frame so that it
    contains only the header row. Every file below *filepath* is then handed
    to ``prep_func`` in its own thread, with a 0.5 second pause after each
    successful start. The threads are not joined here.

    Args:
        filepath: Root directory that is walked recursively.
        aim_dir: Path of the output CSV file (a file path, despite the name).
    """
    columns = ['Name', 'Type', 'TokenVocab', 'RuleCount', 'RuleHash',
               'Repo', 'FileName', 'Date']
    header_frame = pd.DataFrame({column: [] for column in columns})
    header_frame.to_csv(aim_dir, mode='w')

    for root, _dirs, files in os.walk(filepath):
        for file in files:
            file_path = os.path.join(root, file)
            working_thread = threading.Thread(target=prep_func,
                                              args=[file_path, aim_dir])
            try:
                working_thread.start()
            except RuntimeError:
                # Thread.start raises RuntimeError when no new thread can be
                # created; report the file and carry on with the next one.
                print('Thread couldnt be started', file_path)
                continue
            # Pause between starts to limit how fast threads are spawned.
            time.sleep(0.5)
loaddistributor('C:\\Users\Admin\\Documents\\Studium\MSR\\Versioning\\','C:\\Users\Admin\\Documents\\Studium\MSR\\test.csv')
\ No newline at end of file
loaddistributor('C:\\Users\Admin\\Documents\\Studium\MSR\\datadump\\1\\','C:\\Users\Admin\\Documents\\Studium\MSR\\Antlr_Parr.csv')
\ No newline at end of file
This diff is collapsed.
......@@ -9,12 +9,12 @@ import stat
file = 'sample.csv'
path = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\datadump\\'
raw_data = pd.read_csv(file)
#raw_data = pd.read_csv(file)
raw_data =['antlr/grammars-v4']
counter = 1
for repo in raw_data['repository']:
for repo in raw_data:
rep_add = 'https://github.com/' + repo
path_temp = path + str(counter)
os.mkdir(path_temp)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment