How to use jupyter nbconvert
最近在使用Jupyter Notebook的时候,发现notebook文件在问题探索方面非常方便,但是交付的话,还是期望能将其转换为python源文件。要将notebook源文件(.ipynb)转换为python源文件(.py),可以使用命令jupyter nbconvert来完成(nbconvert只接受notebook作为输入,若需要从.py反向生成.ipynb,可借助jupytext等工具)。举例如下,
这里有一个文件,名称为Chap14.ipynb,内容如下:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ cat Chap14.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyalink.alink import *\n",
"useLocalEnv(1)\n",
"\n",
"from utils import *\n",
"import os\n",
"import pandas as pd\n",
"\n",
"pd.set_option('display.max_colwidth', 1000)\n",
"\n",
"DATA_DIR = ROOT_DIR + \"ctr_avazu\" + os.sep\n",
"\n",
"SCHEMA_STRING\\\n",
" = \"id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, \"\\\n",
" + \"site_category string, app_id string, app_domain string, app_category string, device_id string, \"\\\n",
" + \"device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, \"\\\n",
" + \"C16 int, C17 int, C18 int, C19 int, C20 int, C21 int\"\n",
"\n",
"CATEGORY_COL_NAMES = [\n",
" \"C1\", \"banner_pos\", \"site_category\", \"app_domain\",\n",
" \"app_category\", \"device_type\", \"device_conn_type\",\n",
" \"site_id\", \"site_domain\", \"device_id\", \"device_model\"\n",
"]\n",
"\n",
"NUMERICAL_COL_NAMES = [\"C14\", \"C15\", \"C16\", \"C17\", \"C18\", \"C19\", \"C20\", \"C21\"]\n",
"\n",
"FEATURE_MODEL_FILE = \"feature_model.ak\"\n",
"INIT_MODEL_FILE = \"init_model.ak\"\n",
"\n",
"LABEL_COL_NAME = \"click\"\n",
"VEC_COL_NAME = \"vec\"\n",
"PREDICTION_COL_NAME = \"pred\"\n",
"PRED_DETAIL_COL_NAME = \"pred_info\"\n",
"\n",
"NUM_HASH_FEATURES = 30000\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_2\n",
"TextSourceBatchOp()\\\n",
" .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n",
" + \"data-files/avazu-small.csv\")\\\n",
" .firstN(10)\\\n",
" .print()\n",
"\n",
"trainBatchData = CsvSourceBatchOp()\\\n",
" .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n",
" + \"data-files/avazu-small.csv\")\\\n",
" .setSchemaStr(SCHEMA_STRING);\n",
"\n",
"trainBatchData.firstN(10).print();\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_3\n",
"trainBatchData = CsvSourceBatchOp()\\\n",
" .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n",
" + \"data-files/avazu-small.csv\")\\\n",
" .setSchemaStr(SCHEMA_STRING);\n",
"\n",
"feature_pipeline = Pipeline()\\\n",
" .add(\n",
" StandardScaler()\\\n",
" .setSelectedCols(NUMERICAL_COL_NAMES)\n",
" )\\\n",
" .add(\n",
" FeatureHasher()\\\n",
" .setSelectedCols(CATEGORY_COL_NAMES + NUMERICAL_COL_NAMES)\\\n",
" .setCategoricalCols(CATEGORY_COL_NAMES)\\\n",
" .setOutputCol(VEC_COL_NAME)\\\n",
" .setNumFeatures(NUM_HASH_FEATURES)\n",
" );\n",
"\n",
"if not(os.path.exists(DATA_DIR + FEATURE_MODEL_FILE)) :\n",
" feature_pipeline\\\n",
" .fit(trainBatchData)\\\n",
" .save(DATA_DIR + FEATURE_MODEL_FILE)\n",
" BatchOperator.execute()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_4\n",
"feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE)\n",
"\n",
"data = CsvSourceStreamOp()\\\n",
" .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n",
" + \"data-files/avazu-ctr-train-8M.csv\")\\\n",
" .setSchemaStr(SCHEMA_STRING);\n",
"\n",
"if not(os.path.exists(DATA_DIR + INIT_MODEL_FILE)) :\n",
" trainBatchData = CsvSourceBatchOp()\\\n",
" .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n",
" + \"data-files/avazu-small.csv\")\\\n",
" .setSchemaStr(SCHEMA_STRING);\n",
"\n",
" lr = LogisticRegressionTrainBatchOp()\\\n",
" .setVectorCol(VEC_COL_NAME)\\\n",
" .setLabelCol(LABEL_COL_NAME)\\\n",
" .setWithIntercept(True)\\\n",
" .setMaxIter(10);\n",
"\n",
" feature_pipelineModel\\\n",
" .transform(trainBatchData)\\\n",
" .link(lr)\\\n",
" .link(\n",
" AkSinkBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE)\n",
" );\n",
" BatchOperator.execute();\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_5 \n",
"feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);\n",
"\n",
"initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);\n",
"\n",
"data = CsvSourceStreamOp()\\\n",
" .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n",
" + \"data-files/avazu-ctr-train-8M.csv\")\\\n",
" .setSchemaStr(SCHEMA_STRING)\\\n",
" .setIgnoreFirstLine(True)\n",
"\n",
"spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);\n",
"train_stream_data = feature_pipelineModel.transform(spliter);\n",
"test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));\n",
"\n",
"model = FtrlTrainStreamOp(initModel)\\\n",
" .setVectorCol(VEC_COL_NAME)\\\n",
" .setLabelCol(LABEL_COL_NAME)\\\n",
" .setWithIntercept(True)\\\n",
" .setAlpha(0.1)\\\n",
" .setBeta(0.1)\\\n",
" .setL1(0.01)\\\n",
" .setL2(0.01)\\\n",
" .setTimeInterval(10)\\\n",
" .setVectorSize(NUM_HASH_FEATURES)\\\n",
" .linkFrom(train_stream_data);\n",
"\n",
"predResult = FtrlPredictStreamOp(initModel)\\\n",
" .setVectorCol(VEC_COL_NAME)\\\n",
" .setPredictionCol(PREDICTION_COL_NAME)\\\n",
" .setReservedCols([LABEL_COL_NAME])\\\n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n",
" .linkFrom(model, test_stream_data);\n",
"\n",
"# predResult\\\n",
"# .sample(0.0001)\\\n",
"# .select(\"'Pred Sample' AS out_type, *\")\\\n",
"# .print();\n",
"\n",
"predResult.print(key=\"predResult\", refreshInterval = 30, maxLimit=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predResult\\\n",
" .link(\n",
" EvalBinaryClassStreamOp()\\\n",
" .setLabelCol(LABEL_COL_NAME)\\\n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n",
" .setTimeInterval(10)\n",
" )\\\n",
" .link(\n",
" JsonValueStreamOp()\\\n",
" .setSelectedCol(\"Data\")\\\n",
" .setReservedCols([\"Statistics\"])\\\n",
" .setOutputCols([\"Accuracy\", \"AUC\", \"ConfusionMatrix\"])\\\n",
" .setJsonPath([\"$.Accuracy\", \"$.AUC\", \"$.ConfusionMatrix\"])\n",
" )\\\n",
" .print(key=\"evaluation\", refreshInterval = 30, maxLimit=20)\n",
"# .select(\"'Eval Metric' AS out_type, *\")\\\n",
"# .print();\n",
"\n",
"StreamOperator.execute();\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_6\n",
"data = CsvSourceStreamOp()\\\n",
" .setFilePath(\"http://alink-release.oss-cn-beijing.aliyuncs.com/\"\n",
" + \"data-files/avazu-ctr-train-8M.csv\")\\\n",
" .setSchemaStr(SCHEMA_STRING)\\\n",
" .setIgnoreFirstLine(True);\n",
"\n",
"feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);\n",
"\n",
"spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);\n",
"train_stream_data = feature_pipelineModel.transform(spliter);\n",
"test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));\n",
"\n",
"initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);\n",
"\n",
"model = FtrlTrainStreamOp(initModel)\\\n",
" .setVectorCol(VEC_COL_NAME)\\\n",
" .setLabelCol(LABEL_COL_NAME)\\\n",
" .setWithIntercept(True)\\\n",
" .setAlpha(0.1)\\\n",
" .setBeta(0.1)\\\n",
" .setL1(0.01)\\\n",
" .setL2(0.01)\\\n",
" .setTimeInterval(10)\\\n",
" .setVectorSize(NUM_HASH_FEATURES)\\\n",
" .linkFrom(train_stream_data);\n",
"\n",
"model_filter = FtrlModelFilterStreamOp()\\\n",
" .setPositiveLabelValueString(\"1\")\\\n",
" .setVectorCol(VEC_COL_NAME)\\\n",
" .setLabelCol(LABEL_COL_NAME)\\\n",
" .setAccuracyThreshold(0.83)\\\n",
" .setAucThreshold(0.71)\\\n",
" .linkFrom(model, train_stream_data);\n",
"\n",
"model_filter\\\n",
" .select(\"'Model' AS out_type, *\")\\\n",
" .print();\n",
"\n",
"predResult = FtrlPredictStreamOp(initModel)\\\n",
" .setVectorCol(VEC_COL_NAME)\\\n",
" .setPredictionCol(PREDICTION_COL_NAME)\\\n",
" .setReservedCols([LABEL_COL_NAME])\\\n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n",
" .linkFrom(model_filter, test_stream_data);\n",
"\n",
"predResult\\\n",
" .sample(0.0001)\\\n",
" .select(\"'Pred Sample' AS out_type, *\")\\\n",
" .print();\n",
"\n",
"predResult\\\n",
" .link(\n",
" EvalBinaryClassStreamOp()\\\n",
" .setPositiveLabelValueString(\"1\")\\\n",
" .setLabelCol(LABEL_COL_NAME)\\\n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\\\n",
" .setTimeInterval(10)\n",
" )\\\n",
" .link(\n",
" JsonValueStreamOp()\\\n",
" .setSelectedCol(\"Data\")\\\n",
" .setReservedCols([\"Statistics\"])\\\n",
" .setOutputCols([\"Accuracy\", \"AUC\", \"ConfusionMatrix\"])\\\n",
" .setJsonPath([\"$.Accuracy\", \"$.AUC\", \"$.ConfusionMatrix\"])\n",
" )\\\n",
" .select(\"'Eval Metric' AS out_type, *\")\\\n",
" .print();\n",
"\n",
"StreamOperator.execute();\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$
接下来,我们借助命令jupyter nbconvert将其转换为.py文件,命令如下:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ mkdir -p python
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ jupyter nbconvert --to python Chap14.ipynb --output-dir python
[NbConvertApp] Converting notebook Chap14.ipynb to python
[NbConvertApp] Writing 7347 bytes to python/Chap14.py
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$
我们看到,已经在python目录下生成了文件Chap14.py。
接下来,我们看一下生成的Chap14.py文件的内容:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ cat python/Chap14.py
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
from pyalink.alink import *
useLocalEnv(1)
from utils import *
import os
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
DATA_DIR = ROOT_DIR + "ctr_avazu" + os.sep
SCHEMA_STRING\
= "id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, "\
+ "site_category string, app_id string, app_domain string, app_category string, device_id string, "\
+ "device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, "\
+ "C16 int, C17 int, C18 int, C19 int, C20 int, C21 int"
CATEGORY_COL_NAMES = [
"C1", "banner_pos", "site_category", "app_domain",
"app_category", "device_type", "device_conn_type",
"site_id", "site_domain", "device_id", "device_model"
]
NUMERICAL_COL_NAMES = ["C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"]
FEATURE_MODEL_FILE = "feature_model.ak"
INIT_MODEL_FILE = "init_model.ak"
LABEL_COL_NAME = "click"
VEC_COL_NAME = "vec"
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "pred_info"
NUM_HASH_FEATURES = 30000
# In[ ]:
#c_2
TextSourceBatchOp()\
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")\
.firstN(10)\
.print()
trainBatchData = CsvSourceBatchOp()\
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")\
.setSchemaStr(SCHEMA_STRING);
trainBatchData.firstN(10).print();
# In[ ]:
#c_3
trainBatchData = CsvSourceBatchOp()\
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")\
.setSchemaStr(SCHEMA_STRING);
feature_pipeline = Pipeline()\
.add(
StandardScaler()\
.setSelectedCols(NUMERICAL_COL_NAMES)
)\
.add(
FeatureHasher()\
.setSelectedCols(CATEGORY_COL_NAMES + NUMERICAL_COL_NAMES)\
.setCategoricalCols(CATEGORY_COL_NAMES)\
.setOutputCol(VEC_COL_NAME)\
.setNumFeatures(NUM_HASH_FEATURES)
);
if not(os.path.exists(DATA_DIR + FEATURE_MODEL_FILE)) :
feature_pipeline\
.fit(trainBatchData)\
.save(DATA_DIR + FEATURE_MODEL_FILE)
BatchOperator.execute()
# In[ ]:
#c_4
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE)
data = CsvSourceStreamOp()\
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-ctr-train-8M.csv")\
.setSchemaStr(SCHEMA_STRING);
if not(os.path.exists(DATA_DIR + INIT_MODEL_FILE)) :
trainBatchData = CsvSourceBatchOp()\
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")\
.setSchemaStr(SCHEMA_STRING);
lr = LogisticRegressionTrainBatchOp()\
.setVectorCol(VEC_COL_NAME)\
.setLabelCol(LABEL_COL_NAME)\
.setWithIntercept(True)\
.setMaxIter(10);
feature_pipelineModel\
.transform(trainBatchData)\
.link(lr)\
.link(
AkSinkBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE)
);
BatchOperator.execute();
# In[ ]:
#c_5
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);
initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);
data = CsvSourceStreamOp()\
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-ctr-train-8M.csv")\
.setSchemaStr(SCHEMA_STRING)\
.setIgnoreFirstLine(True)
spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);
train_stream_data = feature_pipelineModel.transform(spliter);
test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));
model = FtrlTrainStreamOp(initModel)\
.setVectorCol(VEC_COL_NAME)\
.setLabelCol(LABEL_COL_NAME)\
.setWithIntercept(True)\
.setAlpha(0.1)\
.setBeta(0.1)\
.setL1(0.01)\
.setL2(0.01)\
.setTimeInterval(10)\
.setVectorSize(NUM_HASH_FEATURES)\
.linkFrom(train_stream_data);
predResult = FtrlPredictStreamOp(initModel)\
.setVectorCol(VEC_COL_NAME)\
.setPredictionCol(PREDICTION_COL_NAME)\
.setReservedCols([LABEL_COL_NAME])\
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
.linkFrom(model, test_stream_data);
# predResult\
# .sample(0.0001)\
# .select("'Pred Sample' AS out_type, *")\
# .print();
predResult.print(key="predResult", refreshInterval = 30, maxLimit=20)
# In[ ]:
predResult\
.link(
EvalBinaryClassStreamOp()\
.setLabelCol(LABEL_COL_NAME)\
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
.setTimeInterval(10)
)\
.link(
JsonValueStreamOp()\
.setSelectedCol("Data")\
.setReservedCols(["Statistics"])\
.setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])\
.setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
)\
.print(key="evaluation", refreshInterval = 30, maxLimit=20)
# .select("'Eval Metric' AS out_type, *")\
# .print();
StreamOperator.execute();
# In[ ]:
#c_6
data = CsvSourceStreamOp()\
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-ctr-train-8M.csv")\
.setSchemaStr(SCHEMA_STRING)\
.setIgnoreFirstLine(True);
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);
spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);
train_stream_data = feature_pipelineModel.transform(spliter);
test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));
initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);
model = FtrlTrainStreamOp(initModel)\
.setVectorCol(VEC_COL_NAME)\
.setLabelCol(LABEL_COL_NAME)\
.setWithIntercept(True)\
.setAlpha(0.1)\
.setBeta(0.1)\
.setL1(0.01)\
.setL2(0.01)\
.setTimeInterval(10)\
.setVectorSize(NUM_HASH_FEATURES)\
.linkFrom(train_stream_data);
model_filter = FtrlModelFilterStreamOp()\
.setPositiveLabelValueString("1")\
.setVectorCol(VEC_COL_NAME)\
.setLabelCol(LABEL_COL_NAME)\
.setAccuracyThreshold(0.83)\
.setAucThreshold(0.71)\
.linkFrom(model, train_stream_data);
model_filter\
.select("'Model' AS out_type, *")\
.print();
predResult = FtrlPredictStreamOp(initModel)\
.setVectorCol(VEC_COL_NAME)\
.setPredictionCol(PREDICTION_COL_NAME)\
.setReservedCols([LABEL_COL_NAME])\
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
.linkFrom(model_filter, test_stream_data);
predResult\
.sample(0.0001)\
.select("'Pred Sample' AS out_type, *")\
.print();
predResult\
.link(
EvalBinaryClassStreamOp()\
.setPositiveLabelValueString("1")\
.setLabelCol(LABEL_COL_NAME)\
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)\
.setTimeInterval(10)
)\
.link(
JsonValueStreamOp()\
.setSelectedCol("Data")\
.setReservedCols(["Statistics"])\
.setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])\
.setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
)\
.select("'Eval Metric' AS out_type, *")\
.print();
StreamOperator.execute();
# In[ ]:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$