Synchronize your data
How to synchronize data from external data sources.
After defining the content types, you can start synchronizing the data. First, however, the data needs to be prepared in the JSON Lines (jsonl) format so that Optimizely Graph can ingest it. Read on to learn how you can transform TSV data into JSON using Python.
Note
This tutorial uses the non-commercial datasets of IMDb.
The following fields must be set on every item so that you can find results when you query after synchronization:
- `ContentType` – The values of Content Types.
- `Status` – Set this field to one of the following values:
  - `Draft` – Not exposed or available with the public key.
  - `Published` – Available with the public key.
- `_rbac` – Allows items to be hidden using role-based access. To make the data available to everyone, set the value to `r:Everyone:Read`.
- `__typename` – Needed to support inline fragments, where the value must be the implemented content type.
Note
`__typename` is prefixed with two underscores.
Each item you send to Optimizely Graph needs to be preceded by a line of JSON that sets the `_id` (the unique ID belonging to the item) and the `language_routing`, which we set to "en" because this is what we configured in the Content Types' `languages`.
We can easily synchronize the data from the TSV files to Optimizely Graph (`POST /api/content/v2/data`) with the following Python code:
#!/usr/bin/env python
import csv
import json
import collections
import requests as requests
# Short alias so the rest of the script can refer to OrderedDict directly.
OrderedDict = collections.OrderedDict

# Source identifier; it is sent as the `id` query parameter of the endpoint.
SOURCE = "imdb"
DATA_SYNC_ENDPOINT = "https://cg.optimizely.com/api/content/v2/data?id=" + SOURCE

# Headers sent with every bulk request. `<Token>` is a placeholder.
HEADERS = {
    "Content-Type": "text/plain",
    "Authorization": "Basic <Token>",
}

# Input TSV files, one per content type.
NAME_BASICS_FILE = "data/name.basics.small.tsv"
TITLE_BASICS_FILE = "data/title.basics.small.tsv"
TITLE_RATINGS_FILE = "data/title.ratings.small.tsv"

# Column names grouped by the JSON type their values are converted to.
STRING_ARRAY_FIELDS = [
    "ContentType",
    "knownForTitles",
    "primaryProfession___searchable",
    "genres___searchable",
]
INT_FIELDS = [
    "birthYear",
    "deathYear",
    "startYear",
    "endYear",
    "runtimeMinutes",
    "numVotes",
]
FLOAT_FIELDS = ["averageRating"]
BOOLEAN_FIELDS = ["isAdult"]
def _convert_field(name, value):
    """Convert one raw TSV cell to the JSON type configured for column *name*.

    Columns listed in STRING_ARRAY_FIELDS are split on commas, INT_FIELDS and
    FLOAT_FIELDS are parsed as numbers (the TSV marker ``\\N`` becomes
    ``None``), and BOOLEAN_FIELDS are true only for the string ``"1"``.
    Values of any other column are returned unchanged.
    """
    if name in STRING_ARRAY_FIELDS:
        # str.split already returns [value] when there is no comma.
        return value.split(",")
    if name in INT_FIELDS:
        return int(value) if value != "\\N" else None
    if name in FLOAT_FIELDS:
        return float(value) if value != "\\N" else None
    if name in BOOLEAN_FIELDS:
        return value == "1"
    return value


def load_data(source, content_type, batch_size=100):
    """Read a TSV file and POST its rows to DATA_SYNC_ENDPOINT in bulk batches.

    Every data row becomes one JSON document keyed by the header row's column
    names, extended with the ``ContentType``, ``Status``, ``_rbac`` and
    ``__typename`` fields. Documents are sent in batches; each document is
    preceded by an ``index`` action line carrying its ``_id`` and
    ``language_routing``. The response body of every request is printed.

    Args:
        source: Path of the TSV file. It is also used as the prefix of every
            generated ``_id``.
        content_type: Content type name stored in ``ContentType`` (together
            with ``"Record"``) and in ``__typename``.
        batch_size: Maximum number of documents per request (default 100).

    Raises:
        ValueError: If a cell of an int/float column is not numeric and is
            not the ``\\N`` marker.
    """
    # newline='' is what the csv module requires for correct line handling.
    # The encoding is pinned so the result does not depend on the platform
    # locale (assumes the TSV files are UTF-8 -- TODO confirm for other data).
    with open(source, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
        columns = next(reader, None)
        if columns is None:
            # Completely empty file: no header row, nothing to synchronize.
            return

        batch = []
        rows_read = 0
        for is_last_check, row in is_last(reader):
            item = OrderedDict(
                (name, _convert_field(name, value))
                for name, value in zip(columns, row)
            )
            # Set the system fields by name so that a row with missing or
            # extra cells cannot shift them onto the wrong keys.
            item["ContentType"] = ["Record", content_type]
            item["Status"] = "Published"
            item["_rbac"] = "r:Everyone:Read"
            item["__typename"] = content_type
            batch.append(item)
            rows_read += 1

            if len(batch) >= batch_size or is_last_check:
                # Ids are derived from the 1-based row number, so they stay
                # unique across batches even when the final batch is partial.
                first_row = rows_read - len(batch) + 1
                lines = []
                for offset, doc in enumerate(batch):
                    action = {
                        "index": {
                            "_id": source + str(first_row + offset),
                            "language_routing": "en",
                        }
                    }
                    # json.dumps escapes backslashes/quotes in the path.
                    lines.append(json.dumps(action))
                    lines.append(json.dumps(doc))
                bulk = '\n'.join(lines)
                response = requests.request(
                    "POST", DATA_SYNC_ENDPOINT, headers=HEADERS, data=bulk
                )
                print(response.text)
                batch = []
def is_last(itr):
    """Yield ``(is_last, item)`` pairs for every item of *itr*.

    The flag is ``False`` for every item except the final one, for which it
    is ``True``. An empty input yields nothing.

    Args:
        itr: Any iterable or iterator.

    Yields:
        Tuples ``(bool, item)`` in the original order.
    """
    # Accept plain iterables (lists, tuples, ...) as well as iterators.
    itr = iter(itr)
    # A private sentinel detects exhaustion without letting StopIteration
    # escape from the generator, which Python turns into RuntimeError.
    exhausted = object()
    old = next(itr, exhausted)
    if old is exhausted:
        return
    # Stay one item behind so the final item is known when it is yielded.
    for new in itr:
        yield False, old
        old = new
    yield True, old
# Synchronize each TSV file under the content type assigned to its rows.
for tsv_file, type_name in (
    (NAME_BASICS_FILE, "Actor"),
    (TITLE_BASICS_FILE, "Title"),
    (TITLE_RATINGS_FILE, "Rating"),
):
    load_data(tsv_file, type_name)
A bulk request will look like this:
{"index": { "_id": "data/name.basics.small.tsv1000", "language_routing": "en" }}
{"nconst": "nm0000001", "primaryName___searchable": "Fred Astaire", "birthYear": 1899, "deathYear": 1987, "primaryProfession___searchable": ["soundtrack", "actor", "miscellaneous"], "knownForTitles": ["tt0050419", "tt0031983", "tt0053137", "tt0072308"], "ContentType": ["Record", "Actor"], "Status": "Published", "_rbac": "r:Everyone:Read", "__typename": "Actor"}
{"index": { "_id": "data/name.basics.small.tsv1001", "language_routing": "en" }}
{"nconst": "nm0000002", "primaryName___searchable": "Lauren Bacall", "birthYear": 1924, "deathYear": 2014, "primaryProfession___searchable": ["actress", "soundtrack"], "knownForTitles": ["tt0075213", "tt0037382", "tt0117057", "tt0038355"], "ContentType": ["Record", "Actor"], "Status": "Published", "_rbac": "r:Everyone:Read", "__typename": "Actor"}
Updated 6 months ago