Sync your data
How to sync data from external data sources.
After defining the content types, you can start syncing the data. However, the data must first be formatted as JSON Lines for Optimizely Graph to ingest it.
Note
This tutorial uses the non-commercial datasets of IMDb.
The following fields are required to be set so that you can find results when you query after syncing (see the example after this list):
ContentType – The values of the item's content types.
Status – Set to one of the following:
Draft – Not exposed or available with the public key.
Published – Available with the public key.
_rbac – Uses role-based access to hide items. To make the data available to everyone, set the value to r:Everyone:Read.
__typename – Needed to support inline fragments, where the value must be the implemented content type.
Note
__typename is prefixed with two underscores.
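For example, a minimal sketch of one item with the required fields set, using the Actor content type and property values that appear in the bulk example later in this section (adjust the properties for your own content types):
import json

# One item with the required Optimizely Graph fields alongside its own properties.
record = {
    "nconst": "nm0000001",
    "primaryName___searchable": "Fred Astaire",
    "ContentType": ["Record", "Actor"],  # values of the item's content types
    "Status": "Published",               # available with the public key
    "_rbac": "r:Everyone:Read",          # readable by everyone
    "__typename": "Actor"                # the implemented content type
}

print(json.dumps(record))  # one JSON line of the JSON Lines payload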
Each item you send to Optimizely Graph must be prefaced with a JSON line that sets the _id (a unique ID belonging to the item) and the language_routing. For this tutorial, language_routing is set to "en" because that is the language configured in the content types' languages.
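As a sketch of that pairing (the helper name make_bulk_line is illustrative, not part of the Optimizely Graph API), each item becomes two JSON lines:
import json

def make_bulk_line(item_id, record):
    # Header line with the unique _id and the language_routing ("en",
    # matching the language configured for the content types), followed
    # by the item itself on the next line.
    header = json.dumps({"index": {"_id": item_id, "language_routing": "en"}})
    return header + "\n" + json.dumps(record)

# Illustrative usage with a made-up ID:
print(make_bulk_line("imdb1000", {"primaryName___searchable": "Fred Astaire", "__typename": "Actor"}))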
You can sync the data from the TSV files to Optimizely Graph (POST /api/content/v2/data
) using the following Python code:
#!/usr/bin/env python
import csv
import json
import collections

import requests

OrderedDict = collections.OrderedDict

SOURCE = "imdb"
DATA_SYNC_ENDPOINT = "https://cg.optimizely.com/api/content/v2/data?id={}".format(SOURCE)
HEADERS = {
    'Content-Type': 'text/plain',
    'Authorization': 'Basic <Token>'  # replace <Token> with your authorization token
}

NAME_BASICS_FILE = 'data/name.basics.small.tsv'
TITLE_BASICS_FILE = 'data/title.basics.small.tsv'
TITLE_RATINGS_FILE = 'data/title.ratings.small.tsv'

# Fields that need type conversion from the raw TSV strings.
STRING_ARRAY_FIELDS = ["ContentType", "knownForTitles", "primaryProfession___searchable", "genres___searchable"]
INT_FIELDS = ["birthYear", "deathYear", "startYear", "endYear", "runtimeMinutes", "numVotes"]
FLOAT_FIELDS = ["averageRating"]
BOOLEAN_FIELDS = ["isAdult"]


def load_data(source, content_type):
    data = []
    with open(source, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
        # Extend the TSV header with the required Optimizely Graph fields.
        header = next(reader)
        header.append("ContentType")
        header.append("Status")
        header.append("_rbac")
        header.append("__typename")
        count = 0
        idx = 0
        for (is_last_check, row) in is_last(reader):
            # Convert the raw TSV strings to the types declared in the content type.
            for i, value in enumerate(row):
                if header[i] in STRING_ARRAY_FIELDS:
                    row[i] = value.split(",") if "," in value else [value]
                elif header[i] in INT_FIELDS:
                    row[i] = int(value) if value != "\\N" else None
                elif header[i] in FLOAT_FIELDS:
                    row[i] = float(value) if value != "\\N" else None
                elif header[i] in BOOLEAN_FIELDS:
                    row[i] = value.lower() in ["1"]
            # Append the values for the required fields added to the header above.
            row.append(["Record", content_type])
            row.append("Published")
            row.append("r:Everyone:Read")
            row.append(content_type)
            data.append(OrderedDict(zip(header, row)))
            count += 1
            idx += 1
            # Send the items in batches, flushing the final partial batch as well.
            if count == 100 or is_last_check:
                count = 0
                # Each item is prefaced with a header line that sets _id and language_routing.
                bulk = '\n'.join(
                    "{\"index\": { \"_id\": \"" + source + str(idx + i) + "\", \"language_routing\": \"en\" }}\n" + json.dumps(v)
                    for (i, v) in enumerate(data))
                response = requests.request("POST", DATA_SYNC_ENDPOINT, headers=HEADERS, data=bulk)
                print(response.text)
                data = []


def is_last(itr):
    # Yield (is_last, element) pairs; is_last is True only for the final element.
    old = next(itr)
    for new in itr:
        yield False, old
        old = new
    yield True, old


load_data(NAME_BASICS_FILE, "Actor")
load_data(TITLE_BASICS_FILE, "Title")
load_data(TITLE_RATINGS_FILE, "Rating")
A bulk request will look like the following:
{"index": { "_id": "data/name.basics.small.tsv1000", "language_routing": "en" }}
{"nconst": "nm0000001", "primaryName___searchable": "Fred Astaire", "birthYear": 1899, "deathYear": 1987, "primaryProfession___searchable": ["soundtrack", "actor", "miscellaneous"], "knownForTitles": ["tt0050419", "tt0031983", "tt0053137", "tt0072308"], "ContentType": ["Record", "Actor"], "Status": "Published", "_rbac": "r:Everyone:Read", "__typename": "Actor"}
{"index": { "_id": "data/name.basics.small.tsv1001", "language_routing": "en" }}
{"nconst": "nm0000002", "primaryName___searchable": "Lauren Bacall", "birthYear": 1924, "deathYear": 2014, "primaryProfession___searchable": ["actress", "soundtrack"], "knownForTitles": ["tt0075213", "tt0037382", "tt0117057", "tt0038355"], "ContentType": ["Record", "Actor"], "Status": "Published", "_rbac": "r:Everyone:Read", "__typename": "Actor"}