import threading
import pandas as pd
import os
import re
import time
from random import random
# Binary semaphore (a single permit). Presumably meant to serialise writers of
# the shared output file -- confirm against its users.
file_usage = threading.Semaphore(1)
def prep_func(raw_file, path):
    """Reduce one grammar file to a single feature row and append it to a CSV.

    Args:
        raw_file: Path of the file to read; handed to ``pre_clean``.
        path: CSV file the row is appended to (``mode='a'``, no header row).
    """
    raw_str = pre_clean(raw_file)
    accumulated_vec = vectorize(raw_str)
    # loaddistributor runs this function in several threads that all append to
    # the same ``path``; hold the module-level one-permit semaphore while
    # writing so that rows from different threads cannot interleave.
    with file_usage:
        accumulated_vec.to_csv(path, mode='a', header=False)
# Returns the "naked" data: no string literals, comments, rule bodies or '@' blocks.
def pre_clean(file):
    """Read ``file`` (UTF-8) and strip everything but its bare declarations.

    The removal steps run in a fixed order because later patterns rely on the
    earlier ones: string literals go first so that a ``:``, ``;`` or comment
    marker inside a literal cannot confuse the following patterns.

    Args:
        file: Path of the file to read.

    Returns:
        The remaining text with blank lines dropped, lines joined by ``'\\n'``.
    """
    # Context manager closes the handle even if reading or decoding fails.
    with open(file, encoding="utf-8") as handle:
        data = handle.read()

    data = re.sub(r"'.*?'", '', data)                        # single-line '...' literals
    data = re.sub(r'/\*.*?\*/', '', data, flags=re.DOTALL)   # /* block comments */
    # NOTE: the trailing newline is consumed as well, so the text after a line
    # comment is joined onto the line the comment started on.
    data = re.sub(r'//.*?\n', '', data)                      # // line comments
    data = re.sub(r':.*?;', '', data, flags=re.DOTALL)       # rule bodies (':' up to next ';')
    data = re.sub(r'@.*?}', '', data, flags=re.DOTALL)       # '@' up to the next '}'

    # These are plain literal tokens, so str.replace is enough. They are
    # removed wherever they occur, including inside longer identifiers.
    for token in ('fragment', 'mode', 'options', ';', '{', '}'):
        data = data.replace(token, '')

    # Drop lines that are empty or whitespace-only.
    return '\n'.join(line for line in data.splitlines() if line.strip())
# Returns a DataFrame with the file characteristics:
# grammar name, type, tokenVocab, rule count, rule hash
def vectorize(data):
    """Summarise pre-cleaned grammar text as a one-row DataFrame.

    Each line of ``data`` is classified as a ``tokenVocab`` option, a grammar
    declaration, or (anything else) a rule line that is counted and hashed.

    NOTE(review): the bodies of the ``tokenVocab`` / ``grammar`` branches were
    missing from the code under review (``toVo`` and ``x`` were read but never
    assigned). The parsing below is a reconstruction -- confirm it against the
    intended feature definition.

    Args:
        data: Text as produced by ``pre_clean`` (one declaration per line).

    Returns:
        A DataFrame with the columns ``Name``, ``Type``, ``TokenVocab``,
        ``RuleCount`` and ``RuleHash``. Fields that were not found are ``'-'``.
    """
    name = '-'
    teip = '-'
    toVo = '-'  # previously never initialised -> NameError when building the frame
    rule_hash = 0
    rule_count = 0
    for line in data.splitlines():
        if 'tokenVocab' in line:
            # e.g. "tokenVocab=FooLexer" or "tokenVocab = FooLexer"
            found = re.search(r'tokenVocab\s*=\s*(\S+)', line)
            if found:
                toVo = found.group(1)
        elif 'grammar' in line:
            x = line.split()
            if len(x) == 3:
                # presumably "<type> grammar <name>" -- TODO confirm
                teip, name = x[0], x[2]
            elif len(x) == 2:
                # presumably "grammar <name>"; the type keeps its default
                name = x[1]
        else:
            rule_count += 1
            # NOTE: hash() of a str is salted per interpreter process unless
            # PYTHONHASHSEED is fixed, so RuleHash is only comparable between
            # rows produced in the same run.
            rule_hash += hash(line)
    # The former ``rule_hash / rule_count`` result was never used and raised
    # ZeroDivisionError for input without rule lines, so it is gone.
    return pd.DataFrame({'Name': [name], 'Type': [teip], 'TokenVocab': [toVo],
                         'RuleCount': [rule_count], 'RuleHash': [rule_hash]})
def loaddistributor(filepath, aim_dir):
    """Run ``prep_func`` in its own thread for every file below ``filepath``.

    Args:
        filepath: Root directory that is walked recursively with ``os.walk``.
        aim_dir: Passed unchanged to every ``prep_func`` call as its ``path``
            argument (the CSV that the rows are appended to).

    The call returns after all started threads have finished.
    """
    workers = []
    for root, dirs, files in os.walk(filepath):
        for file_name in files:
            # os.walk yields bare file names; prep_func needs one openable
            # path per call, not the whole ``files`` list.
            full_path = os.path.join(root, file_name)
            worker = threading.Thread(target=prep_func, args=[full_path, aim_dir])
            worker.start()
            workers.append(worker)
            print('thread ' + str(len(workers)) + ' started')
    # Wait for every worker so the output is complete when this returns.
    for worker in workers:
        worker.join()
# Sample inputs for manual runs. Raw strings keep every backslash literal; the
# earlier spelling relied on invalid escape sequences such as "\A" and "\M".
example_file = r'C:\Users\Admin\Documents\Studium\MSR\Versioning\Chris2011DotNetCore-for-NetBeans\CSharpParser-1502210733.g4'
example_file_2 = r'C:\Users\Admin\Documents\Studium\MSR\Versioning\Blazebitblaze-persistence\JPQLSelectExpressionParser-1555255937.g4'
\ No newline at end of file
GitPython was originally written by Michael Trier.
GitPython 0.2 was partially (re)written by Sebastian Thiel, based on 0.1.6 and git-dulwich.
Contributors are:
-Michael Trier <mtrier _at_>
-Alan Briolat
-Florian Apolloner <florian _at_>
-David Aguilar <davvid _at_>
-Jelmer Vernooij <jelmer _at_>
-Steve Frécinaux <code _at_>
-Kai Lautaportti <kai _at_>
-Paul Sowden <paul _at_>
-Sebastian Thiel <byronimo _at_>
-Jonathan Chu <jonathan.chu _at_>
-Vincent Driessen <me _at_>
-Phil Elson <pelson _dot_ pub _at_>
-Bernard `Guyzmo` Pratz <>
-Timothy B. Hartman <tbhartman _at_>
-Konstantin Popov <konstantin.popov.89 _at_>
-Peter Jones <pjones _at_>
-Anson Mansfield <anson.mansfield _at_>
-Ken Odegard <ken.odegard _at_>
-Alexis Horgix Chotard
-Piotr Babij <piotr.babij _at_>
-Mikuláš Poul <mikulaspoul _at_>