bench_generator.raw_data

This module holds the RawData class, which scales the dataset size by the number of members in a dataset, such as the number of rows for tabular data.

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the RawData class which scales the dataset size
  5by the number of members in a dataset such as number of rows for tabular data.
  6"""
  7
  8import os
  9import string
 10from pandas import DataFrame
 11from rdflib import Graph, Literal, Namespace
 12from bench_generator.scenario import Scenario
 13from bench_generator.logger import Logger
 14
# File names written into the instance's data/shared directory: the CSV data
# file, and the mapping file used for the 'postgresql' and 'csv' data formats
# respectively.
 15DATA_FILE = 'data.csv'
 16RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
 17CSV_MAPPING_FILE = 'mapping.rml.ttl'
# RDF namespaces; the generated mapping binds them to the prefixes
# 'rr', 'rml', 'ql' and 'ex'.
 18R2RML = Namespace('http://www.w3.org/ns/r2rml#')
 19RML = Namespace('http://semweb.mmlab.be/ns/rml#')
 20QL = Namespace('http://semweb.mmlab.be/ns/ql#')
 21EX = Namespace('http://example.com/')
 22
 23
 24class RawData(Scenario):
 25    def __init__(self, main_directory: str, verbose: bool,
 26                 number_of_members: int, number_of_properties: int,
 27                 value_size: int, data_format: str, engine: str):
 28        """Initialize a Raw Data scenario.
 29
 30        Parameters
 31        ----------
 32        main_directory : str
 33            Root directory for generating instances of Raw Data.
 34        verbose : bool
 35            Verbose logging enabled or not.
 36        number_of_members : int
 37            Number of members to generate, for example 5000 for 5K rows in a
 38            tabular data structure.
 39        number_of_properties : int
 40            Number of properties per member to generate, for example 20 for
 41            20 columns in a tabular data structure.
 42        value_size : int
 43            Number of characters to add to default value generation,
 44            for example: 256 will expand all values to 256 characters.
 45        data_format : str
 46            Data format to use for generating the data set, for example:
 47            "csv", "json", "xml", "postgresql", "mysql"
 48        engine : str
 49            Engine to use for execution of the generated scenario's instance,
 50            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 51            or "OntopMaterialize"
 52        """
 53        self._number_of_members: int = number_of_members
 54        self._number_of_properties: int = number_of_properties
 55        self._value_size: int = value_size
 56
 57        super().__init__(data_format, engine, main_directory, verbose)
 58        self._logger = Logger(__name__, self._main_directory, self._verbose)
 59
 60    def generate(self) -> bool:
 61        """Generate the instance using the Raw Data scenario.
 62
 63        Only CSV files are currently implemented!
 64        """
 65        if self._data_format == 'csv':
 66            return self._generate_csv()
 67        elif self._data_format == 'postgresql':
 68            return self._generate_postgresql()
 69        else:
 70            raise NotImplementedError(f'Data format {self._data_format} '
 71                                      f'is not implemented by {__name__}')
 72
 73    def path(self) -> str:
 74        """Builds the file path for the instance of a Raw Data scenario.
 75
 76        Returns
 77        -------
 78        path : str
 79            File path for the Raw Data's instance.
 80        """
 81        key = f'raw_{self._number_of_members}_' \
 82              f'{self._number_of_properties}_{self._value_size}'
 83        path = os.path.join(self._main_directory, self._engine,
 84                            self._data_format, key)
 85        self._logger.debug(f'Generating to {path}')
 86        os.makedirs(path, exist_ok=True)
 87        return path
 88
 89    def _generate_dataframe(self, member_offset: int = 1,
 90                            property_offset: int = 1) -> DataFrame:
 91        """Generate raw data.
 92
 93        Parameters
 94        ----------
 95        member_offset : int
 96            Offset to start member ID generation from. Default 1 (no offset).
 97        property_offset : int
 98            Offset to start property ID generation from. Default 1 (no offset).
 99
100        Returns
101        -------
102        dataframe : DataFrame
103            Panda's DataFrame with generated raw data.
104        """
105        subject_id = range(member_offset,
106                           self._number_of_members + member_offset)
107        value_id = range(property_offset,
108                         self._number_of_members + property_offset)
109        data: dict = {'id': subject_id}
110        n_ascii = len(string.ascii_letters)
111
112        for j in range(1, self._number_of_properties + 1):
113            # Append ASCII characters if necessary, use modulo to avoid out of
114            # range in ASCII table
115            append_value = ''
116            if self._value_size > 0:
117                append_value = '_'
118            for n in range(self._value_size):
119                append_value += string.ascii_letters[n % n_ascii]
120
121            # Generate value V_{property}_{member} honoring the value size
122            value = [f'V_{j}-{i}{append_value}' for i in value_id]
123            data[f'p{j}'] = value
124
125        return DataFrame(data)
126
127    def _generate_mapping(self) -> Graph:
128        """Generate a [R2]RML mapping for a RawData instance.
129
130        Returns
131        -------
132        mapping : Graph
133            [R2]RML mapping as an RDFLib Graph.
134        """
135        mapping: Graph = Graph(base='http://ex.com/')
136        mapping.bind('rr', R2RML)
137        mapping.bind('rml', RML)
138        mapping.bind('ql', QL)
139        mapping.bind('ex', EX)
140        subject_template = Literal('http://ex.com/table/{id}')
141
142        if self._data_format == 'postgresql':
143            triples_map_iri = self._add_triples_map_table(mapping,
144                                                          subject_template,
145                                                          Literal('data'))
146        elif self._data_format == 'csv':
147            triples_map_iri = \
148                self._add_triples_map_source(mapping, subject_template,
149                                             Literal('/data/shared/data.csv'))
150        else:
151            raise NotImplementedError(f'{self._data_format} not implemented')
152
153        for i in range(1, self._number_of_properties + 1):
154            self._add_predicate_object_map(mapping, triples_map_iri,
155                                           EX[f'p{i}'], Literal(f'p{i}'))
156
157        return mapping
158
159    def _generate_csv(self) -> bool:
160        """Generate the instance as CSV files.
161
162        Returns
163        -------
164        success : bool
165            True if successfull, false otherwise
166        """
167        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
168        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
169        self._generate_dataframe().to_csv(data_path, index=False)
170
171        mapping_path = os.path.join(self.path(), 'data', 'shared',
172                                    CSV_MAPPING_FILE)
173        mapping: Graph = self._generate_mapping()
174        mapping.serialize(destination=mapping_path, format='turtle')
175        self._generate_scenario()
176
177        return True
178
179    def _generate_postgresql(self) -> bool:
180        """Generate the instance as PostgreSQL with CSV files to load.
181
182        Returns
183        -------
184        success : bool
185            True if successfull, false otherwise
186        """
187        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
188        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
189        self._generate_dataframe().to_csv(data_path, index=False)
190
191        mapping_path = os.path.join(self.path(), 'data', 'shared',
192                                    RDB_MAPPING_FILE)
193        mapping: Graph = self._generate_mapping()
194        mapping.serialize(destination=mapping_path, format='turtle')
195        self._generate_scenario()
196
197        return True
198
199    def _generate_scenario(self) -> bool:
200        """Generate the metadata for this scenario.
201
202        Configures the execution pipeline automatically.
203
204        Returns
205        -------
206        success : bool
207            True if successfull, false otherwise
208        """
209        name: str = f'raw_{self._number_of_members}_' + \
210                    f'{self._number_of_properties}_{self._value_size}'
211        description: str = f'Raw Data Values {self._number_of_members} ' + \
212                           f'members, {self._number_of_properties} ' + \
213                           f'properties, and {self._value_size} value size'
214        iri: str = f'http://example.org/raw/{self._number_of_members}/' + \
215                   f'{self._number_of_properties}/{self._value_size}'
216
217        if self._data_format == 'postgresql':
218            return self._generate_metadata(iri, name, description,
219                                           RDB_MAPPING_FILE)
220        elif self._data_format == 'csv':
221            return self._generate_metadata(iri, name, description,
222                                           CSV_MAPPING_FILE)
223        else:
224            raise NotImplementedError(f'{self._data_format} not implemented')
225
226        return False
# File names written into the instance's data/shared directory: the CSV data
# file, and the mapping file used for the 'postgresql' and 'csv' data formats
# respectively.
DATA_FILE = 'data.csv'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
# RDF namespaces; the generated mapping binds them to the prefixes
# 'rr', 'rml', 'ql' and 'ex'.
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
RML = Namespace('http://semweb.mmlab.be/ns/rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class RawData(bench_generator.scenario.Scenario):
 25class RawData(Scenario):
 26    def __init__(self, main_directory: str, verbose: bool,
 27                 number_of_members: int, number_of_properties: int,
 28                 value_size: int, data_format: str, engine: str):
 29        """Initialize a Raw Data scenario.
 30
 31        Parameters
 32        ----------
 33        main_directory : str
 34            Root directory for generating instances of Raw Data.
 35        verbose : bool
 36            Verbose logging enabled or not.
 37        number_of_members : int
 38            Number of members to generate, for example 5000 for 5K rows in a
 39            tabular data structure.
 40        number_of_properties : int
 41            Number of properties per member to generate, for example 20 for
 42            20 columns in a tabular data structure.
 43        value_size : int
 44            Number of characters to add to default value generation,
 45            for example: 256 will expand all values to 256 characters.
 46        data_format : str
 47            Data format to use for generating the data set, for example:
 48            "csv", "json", "xml", "postgresql", "mysql"
 49        engine : str
 50            Engine to use for execution of the generated scenario's instance,
 51            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 52            or "OntopMaterialize"
 53        """
 54        self._number_of_members: int = number_of_members
 55        self._number_of_properties: int = number_of_properties
 56        self._value_size: int = value_size
 57
 58        super().__init__(data_format, engine, main_directory, verbose)
 59        self._logger = Logger(__name__, self._main_directory, self._verbose)
 60
 61    def generate(self) -> bool:
 62        """Generate the instance using the Raw Data scenario.
 63
 64        Only CSV files are currently implemented!
 65        """
 66        if self._data_format == 'csv':
 67            return self._generate_csv()
 68        elif self._data_format == 'postgresql':
 69            return self._generate_postgresql()
 70        else:
 71            raise NotImplementedError(f'Data format {self._data_format} '
 72                                      f'is not implemented by {__name__}')
 73
 74    def path(self) -> str:
 75        """Builds the file path for the instance of a Raw Data scenario.
 76
 77        Returns
 78        -------
 79        path : str
 80            File path for the Raw Data's instance.
 81        """
 82        key = f'raw_{self._number_of_members}_' \
 83              f'{self._number_of_properties}_{self._value_size}'
 84        path = os.path.join(self._main_directory, self._engine,
 85                            self._data_format, key)
 86        self._logger.debug(f'Generating to {path}')
 87        os.makedirs(path, exist_ok=True)
 88        return path
 89
 90    def _generate_dataframe(self, member_offset: int = 1,
 91                            property_offset: int = 1) -> DataFrame:
 92        """Generate raw data.
 93
 94        Parameters
 95        ----------
 96        member_offset : int
 97            Offset to start member ID generation from. Default 1 (no offset).
 98        property_offset : int
 99            Offset to start property ID generation from. Default 1 (no offset).
100
101        Returns
102        -------
103        dataframe : DataFrame
104            Panda's DataFrame with generated raw data.
105        """
106        subject_id = range(member_offset,
107                           self._number_of_members + member_offset)
108        value_id = range(property_offset,
109                         self._number_of_members + property_offset)
110        data: dict = {'id': subject_id}
111        n_ascii = len(string.ascii_letters)
112
113        for j in range(1, self._number_of_properties + 1):
114            # Append ASCII characters if necessary, use modulo to avoid out of
115            # range in ASCII table
116            append_value = ''
117            if self._value_size > 0:
118                append_value = '_'
119            for n in range(self._value_size):
120                append_value += string.ascii_letters[n % n_ascii]
121
122            # Generate value V_{property}_{member} honoring the value size
123            value = [f'V_{j}-{i}{append_value}' for i in value_id]
124            data[f'p{j}'] = value
125
126        return DataFrame(data)
127
128    def _generate_mapping(self) -> Graph:
129        """Generate a [R2]RML mapping for a RawData instance.
130
131        Returns
132        -------
133        mapping : Graph
134            [R2]RML mapping as an RDFLib Graph.
135        """
136        mapping: Graph = Graph(base='http://ex.com/')
137        mapping.bind('rr', R2RML)
138        mapping.bind('rml', RML)
139        mapping.bind('ql', QL)
140        mapping.bind('ex', EX)
141        subject_template = Literal('http://ex.com/table/{id}')
142
143        if self._data_format == 'postgresql':
144            triples_map_iri = self._add_triples_map_table(mapping,
145                                                          subject_template,
146                                                          Literal('data'))
147        elif self._data_format == 'csv':
148            triples_map_iri = \
149                self._add_triples_map_source(mapping, subject_template,
150                                             Literal('/data/shared/data.csv'))
151        else:
152            raise NotImplementedError(f'{self._data_format} not implemented')
153
154        for i in range(1, self._number_of_properties + 1):
155            self._add_predicate_object_map(mapping, triples_map_iri,
156                                           EX[f'p{i}'], Literal(f'p{i}'))
157
158        return mapping
159
160    def _generate_csv(self) -> bool:
161        """Generate the instance as CSV files.
162
163        Returns
164        -------
165        success : bool
166            True if successfull, false otherwise
167        """
168        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
169        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
170        self._generate_dataframe().to_csv(data_path, index=False)
171
172        mapping_path = os.path.join(self.path(), 'data', 'shared',
173                                    CSV_MAPPING_FILE)
174        mapping: Graph = self._generate_mapping()
175        mapping.serialize(destination=mapping_path, format='turtle')
176        self._generate_scenario()
177
178        return True
179
180    def _generate_postgresql(self) -> bool:
181        """Generate the instance as PostgreSQL with CSV files to load.
182
183        Returns
184        -------
185        success : bool
186            True if successfull, false otherwise
187        """
188        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
189        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
190        self._generate_dataframe().to_csv(data_path, index=False)
191
192        mapping_path = os.path.join(self.path(), 'data', 'shared',
193                                    RDB_MAPPING_FILE)
194        mapping: Graph = self._generate_mapping()
195        mapping.serialize(destination=mapping_path, format='turtle')
196        self._generate_scenario()
197
198        return True
199
200    def _generate_scenario(self) -> bool:
201        """Generate the metadata for this scenario.
202
203        Configures the execution pipeline automatically.
204
205        Returns
206        -------
207        success : bool
208            True if successfull, false otherwise
209        """
210        name: str = f'raw_{self._number_of_members}_' + \
211                    f'{self._number_of_properties}_{self._value_size}'
212        description: str = f'Raw Data Values {self._number_of_members} ' + \
213                           f'members, {self._number_of_properties} ' + \
214                           f'properties, and {self._value_size} value size'
215        iri: str = f'http://example.org/raw/{self._number_of_members}/' + \
216                   f'{self._number_of_properties}/{self._value_size}'
217
218        if self._data_format == 'postgresql':
219            return self._generate_metadata(iri, name, description,
220                                           RDB_MAPPING_FILE)
221        elif self._data_format == 'csv':
222            return self._generate_metadata(iri, name, description,
223                                           CSV_MAPPING_FILE)
224        else:
225            raise NotImplementedError(f'{self._data_format} not implemented')
226
227        return False

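Raw Data scenario whose dataset size scales with the number of members, such as the number of rows for tabular data. (Description inherited from the ABC base class: Helper class that provides a standard way to create an ABC using inheritance.)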

RawData( main_directory: str, verbose: bool, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str)
26    def __init__(self, main_directory: str, verbose: bool,
27                 number_of_members: int, number_of_properties: int,
28                 value_size: int, data_format: str, engine: str):
29        """Initialize a Raw Data scenario.
30
31        Parameters
32        ----------
33        main_directory : str
34            Root directory for generating instances of Raw Data.
35        verbose : bool
36            Verbose logging enabled or not.
37        number_of_members : int
38            Number of members to generate, for example 5000 for 5K rows in a
39            tabular data structure.
40        number_of_properties : int
41            Number of properties per member to generate, for example 20 for
42            20 columns in a tabular data structure.
43        value_size : int
44            Number of characters to add to default value generation,
45            for example: 256 will expand all values to 256 characters.
46        data_format : str
47            Data format to use for generating the data set, for example:
48            "csv", "json", "xml", "postgresql", "mysql"
49        engine : str
50            Engine to use for execution of the generated scenario's instance,
51            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
52            or "OntopMaterialize"
53        """
54        self._number_of_members: int = number_of_members
55        self._number_of_properties: int = number_of_properties
56        self._value_size: int = value_size
57
58        super().__init__(data_format, engine, main_directory, verbose)
59        self._logger = Logger(__name__, self._main_directory, self._verbose)

Initialize a Raw Data scenario.

Parameters
  • main_directory (str): Root directory for generating instances of Raw Data.
  • verbose (bool): Verbose logging enabled or not.
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of ASCII letters appended, after an underscore, to every generated value; for example, 256 appends 256 letters to each value.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
def generate(self) -> bool:
61    def generate(self) -> bool:
62        """Generate the instance using the Raw Data scenario.
63
64        Only CSV files are currently implemented!
65        """
66        if self._data_format == 'csv':
67            return self._generate_csv()
68        elif self._data_format == 'postgresql':
69            return self._generate_postgresql()
70        else:
71            raise NotImplementedError(f'Data format {self._data_format} '
72                                      f'is not implemented by {__name__}')

Generate the instance using the Raw Data scenario.

Only the CSV and PostgreSQL data formats are currently implemented!

def path(self) -> str:
74    def path(self) -> str:
75        """Builds the file path for the instance of a Raw Data scenario.
76
77        Returns
78        -------
79        path : str
80            File path for the Raw Data's instance.
81        """
82        key = f'raw_{self._number_of_members}_' \
83              f'{self._number_of_properties}_{self._value_size}'
84        path = os.path.join(self._main_directory, self._engine,
85                            self._data_format, key)
86        self._logger.debug(f'Generating to {path}')
87        os.makedirs(path, exist_ok=True)
88        return path

Builds the file path for the instance of a Raw Data scenario and creates the directory if it does not exist yet.

Returns
  • path (str): File path for the Raw Data's instance.