bench_generator.mappings

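This module holds the Mappings class, which scales the dataset size by the number of members in a dataset (such as the number of rows for tabular data) and scales the generated [R2]RML mapping by the number of triples maps and the number of predicate-object maps per triples map.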

  1#!/usr/bin/env python3
  2
  3"""
  4This module holds the Mappings class which scales the dataset size
  5by the number of members in a dataset such as number of rows for tabular data.
  6"""
  7
  8import os
  9import string
 10from pandas import DataFrame
 11from rdflib.namespace import RDF
 12from rdflib import Graph, URIRef, BNode, Literal, Namespace
 13from bench_generator.scenario import Scenario
 14from bench_generator.logger import Logger
 15
 16DATA_FILE = 'data.csv'
 17CSV_MAPPING_FILE = 'mapping.rml.ttl'
 18RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
 19R2RML = Namespace('http://www.w3.org/ns/r2rml#')
 20QL = Namespace('http://semweb.mmlab.be/ns/ql#')
 21EX = Namespace('http://example.com/')
 22
 23
 24class Mappings(Scenario):
 25    def __init__(self, main_directory: str, verbose: bool, number_of_tms: int,
 26                 number_of_poms: int, number_of_members: int,
 27                 number_of_properties: int, value_size: int, data_format: str,
 28                 engine: str):
 29        """Initialize a Mappings scenario.
 30
 31        Parameters
 32        ----------
 33        main_directory : str
 34            Root directory for generating instances of Mappings.
 35        verbose : bool
 36            Verbose logging enabled or not.
 37        number_of_members : int
 38            Number of members to generate, for example 5000 for 5K rows in a
 39            tabular data structure.
 40        number_of_properties : int
 41            Number of properties per member to generate, for example 20 for
 42            20 columns in a tabular data structure.
 43        value_size : int
 44            Number of characters to add to default value generation,
 45            for example: 256 will expand all values to 256 characters.
 46        data_format : str
 47            Data format to use for generating the data set, for example:
 48            "csv", "json", "xml", "postgresql", "mysql"
 49        engine : str
 50            Engine to use for execution of the generated scenario's instance,
 51            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 52            or "OntopMaterialize"
 53        """
 54        self._number_of_tms: int = number_of_tms
 55        self._number_of_poms: int = number_of_poms
 56        self._number_of_members: int = number_of_members
 57        self._number_of_properties: int = number_of_properties
 58        self._value_size: int = value_size
 59
 60        super().__init__(data_format, engine, main_directory, verbose)
 61        if self._data_format != 'csv':
 62            raise NotImplementedError(f'Data format {self._data_format} '
 63                                      f'is not implemented by {__name__}')
 64
 65        self._logger = Logger(__name__, self._main_directory, self._verbose)
 66
 67    def generate(self) -> bool:
 68        """Generate the instance using the Mappings scenario.
 69
 70        Only CSV files are currently implemented!
 71        """
 72        if self._data_format == 'csv':
 73            return self._generate_csv()
 74        elif self._data_format == 'postgresql':
 75            return self._generate_postgresql()
 76        else:
 77            raise NotImplementedError(f'Data format {self._data_format} '
 78                                      f'is not implemented by {__name__}')
 79
 80    def path(self) -> str:
 81        """Builds the file path for the instance of a Mappings scenario.
 82
 83        Returns
 84        -------
 85        path : str
 86            File path for the Mappings instance.
 87        """
 88        key = f'mappings_{self._number_of_tms}_' \
 89              f'{self._number_of_poms}'
 90        path = os.path.join(self._main_directory, self._engine,
 91                            self._data_format, key)
 92        self._logger.debug(f'Generating to {path}')
 93        os.makedirs(path, exist_ok=True)
 94        return path
 95
 96    def _generate_dataframe(self, member_offset: int = 1,
 97                            property_offset: int = 1) -> DataFrame:
 98        """Generate the tabular data for a Mappings instance.
 99
100        Parameters
101        ----------
102        member_offset : int
103            Offset to start member ID generation from. Default 1 (no offset).
104        property_offset : int
105            Offset to start property ID generation from. Default 1 (no offset).
106
107        Returns
108        -------
109        dataframe : DataFrame
110            pandas DataFrame with the generated data.
111        """
112        subject_id = range(member_offset,
113                           self._number_of_members + member_offset)
114        value_id = range(property_offset,
115                         self._number_of_members + property_offset)
116        data: dict = {'id': subject_id}
117        n_ascii = len(string.ascii_letters)
118
119        for j in range(1, self._number_of_properties + 1):
120            # Append ASCII characters if necessary, use modulo to avoid out of
121            # range in ASCII table
122            append_value = ''
123            if self._value_size > 0:
124                append_value = '_'
125            for n in range(self._value_size):
126                append_value += string.ascii_letters[n % n_ascii]
127
128            # Generate value V_{property}-{member} honoring the value size
129            value = [f'V_{j}-{i}{append_value}' for i in value_id]
130            data[f'p{j}'] = value
131
132        return DataFrame(data)
133
134    def _generate_mapping(self) -> Graph:
135        """Generate a [R2]RML mapping for a Mappings instance.
136
137        Returns
138        -------
139        mapping : Graph
140            [R2]RML mapping as an RDFLib Graph.
141        """
142        mapping: Graph = Graph(base='http://ex.com/')
143        mapping.bind('rr', R2RML)
144        mapping.bind('ql', QL)
145        mapping.bind('ex', EX)
146
147        for i in range(1, self._number_of_tms + 1):
148            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
149            if self._data_format == 'postgresql':
150                triples_map_iri = self._add_triples_map_table(mapping,
151                                                              subject_template,
152                                                              Literal('data'),
153                                                              number=i)
154            elif self._data_format == 'csv':
155                csv_path = Literal('/data/shared/data.csv')
156                triples_map_iri = \
157                    self._add_triples_map_source(mapping, subject_template,
158                                                 csv_path, number=i)
159            else:
160                msg = f'{self._data_format} not implemented'
161                raise NotImplementedError(msg)
162
163            for j in range(1, self._number_of_poms + 1):
164                self._add_predicate_object_map(mapping, triples_map_iri,
165                                               EX[f'p{j}'], Literal(f'p{j}'))
166
167        return mapping
168
169    def _generate_csv(self) -> bool:
170        """Generate the instance as CSV files.
171
172        Returns
173        -------
174        success : bool
175            True if successful, False otherwise
176        """
177        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
178        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
179        self._generate_dataframe().to_csv(data_path, index=False)
180
181        mapping_path = os.path.join(self.path(), 'data', 'shared',
182                                    CSV_MAPPING_FILE)
183        mapping: Graph = self._generate_mapping()
184        mapping.serialize(destination=mapping_path, format='turtle')
185        self._generate_scenario()
186
187        return True
188
189    def _generate_postgresql(self) -> bool:
190        """Generate the instance as PostgreSQL with CSV files to load.
191
192        Returns
193        -------
194        success : bool
195            True if successful, False otherwise
196        """
197        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
198        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
199        self._generate_dataframe().to_csv(data_path, index=False)
200
201        mapping_path = os.path.join(self.path(), 'data', 'shared',
202                                    RDB_MAPPING_FILE)
203        mapping: Graph = self._generate_mapping()
204        mapping.serialize(destination=mapping_path, format='turtle')
205        self._generate_scenario()
206
207        return True
208
209    def _generate_scenario(self) -> bool:
210        """Generate the metadata for this scenario.
211
212        Configures the execution pipeline automatically.
213
214        Returns
215        -------
216        success : bool
217            True if successful, False otherwise
218        """
219        name: str = f'mappings_{self._number_of_tms}_{self._number_of_poms}'
220        description: str = f'Mappings {self._number_of_tms}TM + ' + \
221                           f'{self._number_of_poms}POMs'
222        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
223                   f'{self._number_of_poms}'
224
225        if self._data_format == 'postgresql':
226            return self._generate_metadata(iri, name, description,
227                                           RDB_MAPPING_FILE)
228        elif self._data_format == 'csv':
229            return self._generate_metadata(iri, name, description,
230                                           CSV_MAPPING_FILE)
231        else:
232            raise NotImplementedError(f'{self._data_format} not implemented')
233
234        return False
DATA_FILE = 'data.csv'
CSV_MAPPING_FILE = 'mapping.rml.ttl'
RDB_MAPPING_FILE = 'mapping.r2rml.ttl'
R2RML = Namespace('http://www.w3.org/ns/r2rml#')
QL = Namespace('http://semweb.mmlab.be/ns/ql#')
EX = Namespace('http://example.com/')
class Mappings(bench_generator.scenario.Scenario):
 25class Mappings(Scenario):
 26    def __init__(self, main_directory: str, verbose: bool, number_of_tms: int,
 27                 number_of_poms: int, number_of_members: int,
 28                 number_of_properties: int, value_size: int, data_format: str,
 29                 engine: str):
 30        """Initialize a Mappings scenario.
 31
 32        Parameters
 33        ----------
 34        main_directory : str
 35            Root directory for generating instances of Mappings.
 36        verbose : bool
 37            Verbose logging enabled or not.
 38        number_of_members : int
 39            Number of members to generate, for example 5000 for 5K rows in a
 40            tabular data structure.
 41        number_of_properties : int
 42            Number of properties per member to generate, for example 20 for
 43            20 columns in a tabular data structure.
 44        value_size : int
 45            Number of characters to add to default value generation,
 46            for example: 256 will expand all values to 256 characters.
 47        data_format : str
 48            Data format to use for generating the data set, for example:
 49            "csv", "json", "xml", "postgresql", "mysql"
 50        engine : str
 51            Engine to use for execution of the generated scenario's instance,
 52            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
 53            or "OntopMaterialize"
 54        """
 55        self._number_of_tms: int = number_of_tms
 56        self._number_of_poms: int = number_of_poms
 57        self._number_of_members: int = number_of_members
 58        self._number_of_properties: int = number_of_properties
 59        self._value_size: int = value_size
 60
 61        super().__init__(data_format, engine, main_directory, verbose)
 62        if self._data_format != 'csv':
 63            raise NotImplementedError(f'Data format {self._data_format} '
 64                                      f'is not implemented by {__name__}')
 65
 66        self._logger = Logger(__name__, self._main_directory, self._verbose)
 67
 68    def generate(self) -> bool:
 69        """Generate the instance using the Mappings scenario.
 70
 71        Only CSV files are currently implemented!
 72        """
 73        if self._data_format == 'csv':
 74            return self._generate_csv()
 75        elif self._data_format == 'postgresql':
 76            return self._generate_postgresql()
 77        else:
 78            raise NotImplementedError(f'Data format {self._data_format} '
 79                                      f'is not implemented by {__name__}')
 80
 81    def path(self) -> str:
 82        """Builds the file path for the instance of a Mappings scenario.
 83
 84        Returns
 85        -------
 86        path : str
 87            File path for the Mappings instance.
 88        """
 89        key = f'mappings_{self._number_of_tms}_' \
 90              f'{self._number_of_poms}'
 91        path = os.path.join(self._main_directory, self._engine,
 92                            self._data_format, key)
 93        self._logger.debug(f'Generating to {path}')
 94        os.makedirs(path, exist_ok=True)
 95        return path
 96
 97    def _generate_dataframe(self, member_offset: int = 1,
 98                            property_offset: int = 1) -> DataFrame:
 99        """Generate the tabular data for a Mappings instance.
100
101        Parameters
102        ----------
103        member_offset : int
104            Offset to start member ID generation from. Default 1 (no offset).
105        property_offset : int
106            Offset to start property ID generation from. Default 1 (no offset).
107
108        Returns
109        -------
110        dataframe : DataFrame
111            pandas DataFrame with the generated data.
112        """
113        subject_id = range(member_offset,
114                           self._number_of_members + member_offset)
115        value_id = range(property_offset,
116                         self._number_of_members + property_offset)
117        data: dict = {'id': subject_id}
118        n_ascii = len(string.ascii_letters)
119
120        for j in range(1, self._number_of_properties + 1):
121            # Append ASCII characters if necessary, use modulo to avoid out of
122            # range in ASCII table
123            append_value = ''
124            if self._value_size > 0:
125                append_value = '_'
126            for n in range(self._value_size):
127                append_value += string.ascii_letters[n % n_ascii]
128
129            # Generate value V_{property}-{member} honoring the value size
130            value = [f'V_{j}-{i}{append_value}' for i in value_id]
131            data[f'p{j}'] = value
132
133        return DataFrame(data)
134
135    def _generate_mapping(self) -> Graph:
136        """Generate a [R2]RML mapping for a Mappings instance.
137
138        Returns
139        -------
140        mapping : Graph
141            [R2]RML mapping as an RDFLib Graph.
142        """
143        mapping: Graph = Graph(base='http://ex.com/')
144        mapping.bind('rr', R2RML)
145        mapping.bind('ql', QL)
146        mapping.bind('ex', EX)
147
148        for i in range(1, self._number_of_tms + 1):
149            subject_template = Literal(f'http://ex.com/table/{{p{i}}}')
150            if self._data_format == 'postgresql':
151                triples_map_iri = self._add_triples_map_table(mapping,
152                                                              subject_template,
153                                                              Literal('data'),
154                                                              number=i)
155            elif self._data_format == 'csv':
156                csv_path = Literal('/data/shared/data.csv')
157                triples_map_iri = \
158                    self._add_triples_map_source(mapping, subject_template,
159                                                 csv_path, number=i)
160            else:
161                msg = f'{self._data_format} not implemented'
162                raise NotImplementedError(msg)
163
164            for j in range(1, self._number_of_poms + 1):
165                self._add_predicate_object_map(mapping, triples_map_iri,
166                                               EX[f'p{j}'], Literal(f'p{j}'))
167
168        return mapping
169
170    def _generate_csv(self) -> bool:
171        """Generate the instance as CSV files.
172
173        Returns
174        -------
175        success : bool
176            True if successful, False otherwise
177        """
178        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
179        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
180        self._generate_dataframe().to_csv(data_path, index=False)
181
182        mapping_path = os.path.join(self.path(), 'data', 'shared',
183                                    CSV_MAPPING_FILE)
184        mapping: Graph = self._generate_mapping()
185        mapping.serialize(destination=mapping_path, format='turtle')
186        self._generate_scenario()
187
188        return True
189
190    def _generate_postgresql(self) -> bool:
191        """Generate the instance as PostgreSQL with CSV files to load.
192
193        Returns
194        -------
195        success : bool
196            True if successful, False otherwise
197        """
198        os.makedirs(os.path.join(self.path(), 'data', 'shared'), exist_ok=True)
199        data_path = os.path.join(self.path(), 'data', 'shared', DATA_FILE)
200        self._generate_dataframe().to_csv(data_path, index=False)
201
202        mapping_path = os.path.join(self.path(), 'data', 'shared',
203                                    RDB_MAPPING_FILE)
204        mapping: Graph = self._generate_mapping()
205        mapping.serialize(destination=mapping_path, format='turtle')
206        self._generate_scenario()
207
208        return True
209
210    def _generate_scenario(self) -> bool:
211        """Generate the metadata for this scenario.
212
213        Configures the execution pipeline automatically.
214
215        Returns
216        -------
217        success : bool
218            True if successful, False otherwise
219        """
220        name: str = f'mappings_{self._number_of_tms}_{self._number_of_poms}'
221        description: str = f'Mappings {self._number_of_tms}TM + ' + \
222                           f'{self._number_of_poms}POMs'
223        iri: str = f'http://example.org/mappings/{self._number_of_tms}/' + \
224                   f'{self._number_of_poms}'
225
226        if self._data_format == 'postgresql':
227            return self._generate_metadata(iri, name, description,
228                                           RDB_MAPPING_FILE)
229        elif self._data_format == 'csv':
230            return self._generate_metadata(iri, name, description,
231                                           CSV_MAPPING_FILE)
232        else:
233            raise NotImplementedError(f'{self._data_format} not implemented')
234
235        return False

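Helper class that provides a standard way to create an ABC using inheritance. (Note: the Mappings class in the listing above defines no class docstring of its own, so this sentence appears to be inherited from its abstract base class; Mappings itself generates the data file and the [R2]RML mapping for the scenario.)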

Mappings( main_directory: str, verbose: bool, number_of_tms: int, number_of_poms: int, number_of_members: int, number_of_properties: int, value_size: int, data_format: str, engine: str)
26    def __init__(self, main_directory: str, verbose: bool, number_of_tms: int,
27                 number_of_poms: int, number_of_members: int,
28                 number_of_properties: int, value_size: int, data_format: str,
29                 engine: str):
30        """Initialize a Mappings scenario.
31
32        Parameters
33        ----------
34        main_directory : str
35            Root directory for generating instances of Mappings.
36        verbose : bool
37            Verbose logging enabled or not.
38        number_of_members : int
39            Number of members to generate, for example 5000 for 5K rows in a
40            tabular data structure.
41        number_of_properties : int
42            Number of properties per member to generate, for example 20 for
43            20 columns in a tabular data structure.
44        value_size : int
45            Number of characters to add to default value generation,
46            for example: 256 will expand all values to 256 characters.
47        data_format : str
48            Data format to use for generating the data set, for example:
49            "csv", "json", "xml", "postgresql", "mysql"
50        engine : str
51            Engine to use for execution of the generated scenario's instance,
52            for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC",
53            or "OntopMaterialize"
54        """
55        self._number_of_tms: int = number_of_tms
56        self._number_of_poms: int = number_of_poms
57        self._number_of_members: int = number_of_members
58        self._number_of_properties: int = number_of_properties
59        self._value_size: int = value_size
60
61        super().__init__(data_format, engine, main_directory, verbose)
62        if self._data_format != 'csv':
63            raise NotImplementedError(f'Data format {self._data_format} '
64                                      f'is not implemented by {__name__}')
65
66        self._logger = Logger(__name__, self._main_directory, self._verbose)

Initialize a Mappings scenario.

Parameters
  • main_directory (str): Root directory for generating instances of Mappings.
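  • verbose (bool): Verbose logging enabled or not.
  • number_of_tms (int): Number of triples maps to generate in the mapping.
  • number_of_poms (int): Number of predicate-object maps to generate per triples map.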
  • number_of_members (int): Number of members to generate, for example 5000 for 5K rows in a tabular data structure.
  • number_of_properties (int): Number of properties per member to generate, for example 20 for 20 columns in a tabular data structure.
  • value_size (int): Number of characters to add to default value generation, for example: 256 will expand all values to 256 characters.
  • data_format (str): Data format to use for generating the data set, for example: "csv", "json", "xml", "postgresql", "mysql"
  • engine (str): Engine to use for execution of the generated scenario's instance, for example: "RMLMapper", "RMLStreamer", "SDMRDFizer", "MorphKGC", or "OntopMaterialize"
def generate(self) -> bool:
68    def generate(self) -> bool:
69        """Generate the instance using the Mappings scenario.
70
71        Only CSV files are currently implemented!
72        """
73        if self._data_format == 'csv':
74            return self._generate_csv()
75        elif self._data_format == 'postgresql':
76            return self._generate_postgresql()
77        else:
78            raise NotImplementedError(f'Data format {self._data_format} '
79                                      f'is not implemented by {__name__}')

Generate the instance using the Mappings scenario.

Only CSV files are currently implemented!

def path(self) -> str:
81    def path(self) -> str:
82        """Builds the file path for the instance of a Mappings scenario.
83
84        Returns
85        -------
86        path : str
87            File path for the Mappings instance.
88        """
89        key = f'mappings_{self._number_of_tms}_' \
90              f'{self._number_of_poms}'
91        path = os.path.join(self._main_directory, self._engine,
92                            self._data_format, key)
93        self._logger.debug(f'Generating to {path}')
94        os.makedirs(path, exist_ok=True)
95        return path

Builds the file path for the instance of a Mappings scenario.

Returns
  • path (str): File path for the Mappings instance.