Commit a1c605f6 authored by Gregor Zimmermann's avatar Gregor Zimmermann

up to date version

parent 212d94c3

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.

G Zim,DESKTOP-RVN2OU3/Admin,DESKTOP-RVN2OU3,26.03.2020 23:28,file:///C:/Users/Admin/AppData/Roaming/OpenOffice/4;
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (untitled2)" project-jdk-type="Python SDK" />
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<module fileurl="file://$PROJECT_DIR$/.idea/untitled2.iml" filepath="$PROJECT_DIR$/.idea/untitled2.iml" />
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RSettings" path="C:\Program Files\R\R-3.6.2\bin\R.exe" />
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
\ No newline at end of file
This diff is collapsed.
import threading
import pandas as pd
import os
import re
import time
from random import random
# Module-wide semaphore created with a single permit (value=1), so at most one
# holder at a time. NOTE(review): the name suggests it is meant to serialize
# access to a shared file (presumably the output CSV) - confirm against the
# code that acquires it.
file_usage = threading.Semaphore(value=1)
def prep_func(raw_file, path):
    """Clean one grammar file, vectorize it and append the row to a CSV file.

    Args:
        raw_file: Path of the grammar file; handed to ``pre_clean`` which
            opens and reads it.
        path: Destination CSV. The row is appended (``mode='a'``) without a
            header line.

    Returns:
        None. The only effect is the appended CSV row.
    """
    raw_str = pre_clean(raw_file)
    accumulated_vec = vectorize(raw_str)
    # This function is used as a thread target, and every worker appends to
    # the same CSV. Holding the module-level single-permit semaphore keeps
    # rows written by different threads from interleaving.
    with file_usage:
        accumulated_vec.to_csv(path, mode='a', header=False)
def pre_clean(file):
    """Return the "naked" text of a grammar file: no strings, comments or bodies.

    The file is read as UTF-8 and reduced in a fixed order:

    1. single-quoted strings on one line (they could contain comment or
       rule delimiters and mislead the later steps),
    2. ``/* ... */`` block comments (may span lines),
    3. ``// ...`` line comments including their trailing newline,
    4. rule bodies, i.e. everything from ``:`` to the next ``;``
       (needs the strings to be gone already),
    5. ``@ ... }`` sections (may span lines),
    6. the literal tokens ``fragment``, ``mode``, ``options``, ``;``,
       ``{`` and ``}``,
    7. lines that are empty or whitespace-only.

    Args:
        file: Path of the file to read.

    Returns:
        The remaining lines joined with ``"\\n"`` (no trailing newline).
    """
    # The context manager closes the handle even when reading/decoding fails.
    with open(file, encoding="utf-8") as handle:
        data = handle.read()

    # Raw-string patterns: same regexes as before, without invalid escapes.
    data = re.sub(r"'.*?'", '', data)
    data = re.sub(r'/\*.*?\*/', '', data, flags=re.DOTALL)
    data = re.sub(r'//.*?\n', '', data)
    data = re.sub(r':.*?;', '', data, flags=re.DOTALL)
    data = re.sub(r'@.*?}', '', data, flags=re.DOTALL)

    # These are plain literals, so str.replace is enough. They are removed
    # one after another, in this order, exactly like the original loop.
    for token in ('fragment', 'mode', 'options', ';', '{', '}'):
        data = data.replace(token, '')

    return '\n'.join(line for line in data.splitlines() if line.strip())
# Returns a one-row pandas DataFrame with the characteristics of a grammar text.
# Columns: Name (grammar name), Type (grammar|parser|...), TokenVocab,
# RuleCount and RuleHash.
# NOTE(review): this listing looks incomplete - the indentation is missing, the
# `if 'tokenVocab'` and `if 'grammar'` statements have no visible body, and `x`
# and `toVo` are never assigned in the visible lines. Restore the missing lines
# from the repository before relying on this function.
def vectorize(data):
# '-' is the placeholder kept when no value is assigned later.
name = '-'
teip ='-'
rule_hash = 0
rule_count = 0
for line in data.splitlines():
# Presumably extracts the token vocabulary (`toVo`) - body not visible here.
if 'tokenVocab' in line:
# Presumably extracts grammar name and type - body not visible here.
if 'grammar' in line:
# `x` is not defined in the visible lines; TODO confirm what is measured here.
if len(x) ==3:
rule_count += 1
# NOTE(review): hash() of a str is salted per interpreter process unless
# PYTHONHASHSEED is fixed, so this sum is not comparable across runs.
rule_hash += hash(line)
# NOTE(review): norm_hash is not used in the DataFrame below, and this division
# raises ZeroDivisionError when rule_count is 0.
norm_hash = rule_hash / rule_count
df= pd.DataFrame({'Name':[name], 'Type':[teip], 'TokenVocab':[toVo], 'RuleCount':[rule_count], 'RuleHash':[rule_hash]})
return df
# Walks the directory tree under `filepath` and creates threads that run
# prep_func with `aim_dir` as the output path argument.
# NOTE(review): this listing looks incomplete - the indentation is missing, `x`
# is not defined in the visible lines and no `start()` call on the thread is
# visible. Restore the missing lines from the repository before relying on it.
def loaddistributor(filepath, aim_dir):
for root, dirs, files in os.walk(filepath):
# NOTE(review): `files` is the list of file names os.walk yields for one
# directory, while prep_func hands its first argument to pre_clean, which
# opens it as a single path - confirm whether a per-file loop is missing here.
test = threading.Thread(target=prep_func, args=[files, aim_dir])
print('thread '+ str(x)+ ' started')
# Sample grammar files (absolute Windows paths on the original author's machine).
# Raw strings keep every backslash literal. The previous literals mixed `\\`
# with the invalid escapes `\A` and `\M`, which only worked because Python
# leaves unknown escapes untouched (and warns about them).
example_file = r'C:\Users\Admin\Documents\Studium\MSR\Versioning\Chris2011DotNetCore-for-NetBeans\CSharpParser-1502210733.g4'
example_file_2 = r'C:\Users\Admin\Documents\Studium\MSR\Versioning\Blazebitblaze-persistence\JPQLSelectExpressionParser-1555255937.g4'
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
GitPython was originally written by Michael Trier.
GitPython 0.2 was partially (re)written by Sebastian Thiel, based on 0.1.6 and git-dulwich.
Contributors are:
-Michael Trier <mtrier _at_>
-Alan Briolat
-Florian Apolloner <florian _at_>
-David Aguilar <davvid _at_>
-Jelmer Vernooij <jelmer _at_>
-Steve Frécinaux <code _at_>
-Kai Lautaportti <kai _at_>
-Paul Sowden <paul _at_>
-Sebastian Thiel <byronimo _at_>
-Jonathan Chu <jonathan.chu _at_>
-Vincent Driessen <me _at_>
-Phil Elson <pelson _dot_ pub _at_>
-Bernard `Guyzmo` Pratz <>
-Timothy B. Hartman <tbhartman _at_>
-Konstantin Popov <konstantin.popov.89 _at_>
-Peter Jones <pjones _at_>
-Anson Mansfield <anson.mansfield _at_>
-Ken Odegard <ken.odegard _at_>
-Alexis Horgix Chotard
-Piotr Babij <piotr.babij _at_>
-Mikuláš Poul <mikulaspoul _at_>